Analysis of the Broad’s Avana CRISPR screen (Broad Institute Cancer Dependency Map 2018, Meyers et al. 2017) and the Broad and Dana-Farber Cancer Institute’s Achilles shRNA screen (MacFarland et al. 2018, Data Science 2018).
The Avana screen produced results using CERES (Meyers et al. 2017) (GitHub), which generates gene dependency scores from sgRNA depletion scores from gene essentiality screens and eliminates bias arising from the effect of copy number variation on Cas9 DNA cleavage. The lower the CERES score, the higher the likelihood that the gene is essential in the associated cell line. Scores are scaled per cell line such that a score of 0 is the median effect of nonessential genes and -1 is the median effect of common core essential genes.
All annotation files (copy number, mutation status, and gene expression) (Consortium and Consortium 2015, Barretina et al. 2012) were downloaded from the DepMap Data Portal.
Edited .Renviron to avoid “Error: memory exhausted (limit reached?)” when melting the GE dataset, as per the suggestion from StackOverflow
cd ~
touch .Renviron
open .Renviron
# Added following line to .Renviron:
R_MAX_VSIZE=100Gb
library(bsselectR)
library(cowplot)
theme_set(theme_light())
library(data.table)
library(ggpubr)
library(ggrepel)
library(ggsignif)
library(gplots)
library(gridExtra)
library(htmlwidgets)
library(kableExtra)
library(papaja)
library(plotly)
library(reshape2)
library(scales)
library(tidyverse)
library(VennDiagram)
# devtools::install_github("crsh/papaja")
adj_signif <- function(df, alpha = 0.05) {
  # Annotate a compare_means() result with Benjamini-Hochberg (step-up)
  # significance at false discovery rate `alpha`.
  #
  # Args:
  #   df:    data frame with columns `p`, `p.adj` and `p.signif`.
  #   alpha: target false discovery rate (defaults to 0.05).
  #
  # Returns `df` with these columns added or rewritten:
  #   p.signif.adj  "*" for rows rejected by the BH procedure, NA otherwise
  #   p.signif      unchanged, except "ns" is replaced by NA
  #   p.short       `p` formatted to 2 significant digits
  #   p.adj.short   `p.adj` formatted to 2 significant digits
  #   k             rank of the largest p-value passing the BH test (NA if none)
  #   FDR_cutoff    that p-value, i.e. the raw p-value rejection cutoff
  sorted_p <- sort(df$p) # sort() drops missing p-values
  m <- length(sorted_p)
  passes <- sorted_p < seq_len(m) / m * alpha

  # Any passing rank is enough. Requiring a mix of TRUE and FALSE would wrongly
  # report "nothing significant" when every p-value passes.
  if (any(passes)) {
    k <- max(which(passes))
    cutoff <- sorted_p[k]
  } else {
    k <- NA
    cutoff <- NA
  }

  # `cutoff` is a raw p-value, so it has to be compared against raw p-values;
  # the BH-adjusted p-values are on a different scale.
  df$p.signif.adj <- ifelse(df$p <= cutoff, "*", NA)
  df$p.signif <- ifelse(df$p.signif == "ns", NA, df$p.signif)
  df$p.short <- formatC(df$p, format = "g", digits = 2)
  df$p.adj.short <- formatC(df$p.adj, format = "g", digits = 2)
  df$k <- rep(k, nrow(df))
  df$FDR_cutoff <- rep(cutoff, nrow(df))
  return(df)
}
# WilcoxonByDrugAllGenes <- function(melt_file, grid_file, data_name) {
# druglist <- as.character(unique(melt_file$Drug))
# sig_df <- NULL
# for(i in 1:length(druglist)) {
# print(paste0("Current drug in process: ", druglist[i], " (drug ", i, " of ", length(druglist), ")."))
# flush.console()
# melted <- filter(melt_file, Drug == druglist[i])
# use_data <- merge(grid_file, melted, by = "accession_id", all.x = TRUE)
# use_data <- merge(use_data, ccl_converter, by = "accession_id", all.x = TRUE)
# use_data <- merge(use_data, maf_df, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)
# use_data$Mutation_Status_Nonsilent <- ifelse(is.na(use_data$Mutation_Status_Nonsilent), "Wildtype", use_data$Mutation_Status_Nonsilent)
# use_data$Hugo_Symbol <- factor(use_data$Hugo_Symbol)
# use_data$AUC <- as.numeric(use_data$AUC)
# use_data <- filter(use_data, !is.na(use_data$AUC))
# use_data$Drug_Gene <- paste0(use_data$Drug, "_", use_data$Hugo_Symbol)
# use_data_ptmuts <- filter(use_data, Reference_Allele_Length == 1 | Reference_Allele_Length == 0 | is.na(Reference_Allele_Length))
# use_data_ptmuts$Keep_PtMut_Tests <- ifelse(use_data_ptmuts$Reference_Allele_Length == 1 | is.na(use_data_ptmuts$Reference_Allele_Length), TRUE, ifelse(use_data_ptmuts$Tumor_Seq_Allele1_Length == 1, TRUE, FALSE))
# use_data_ptmuts <- filter(use_data_ptmuts, Keep_PtMut_Tests == TRUE)
#
# sig <- compare_means(AUC ~ Mutation_Status_Nonsilent, group.by = "Hugo_Symbol", data = use_data_ptmuts, method = "wilcox.test", p.adjust.method = "BH")
# sig <- adj_signif(sig)
# sig <- sig[order(sig$p),]
# sig <- filter(sig, p <= 0.05)
# sig$Drug <- druglist[i]
# sig$Dataset <- toupper(data_name)
# sig_df <- rbind(sig_df, sig)
# }
# return(sig_df)
# }
PearsonByLineage <- function(lin, dataset, variable) {
  # Per-Drug_Gene Pearson correlation between AUC_zscore and a molecular
  # covariate within one lineage.
  #
  # Args:
  #   lin:      value of `group_general_lineage_name` to analyse.
  #   dataset:  data frame with columns group_general_lineage_name, Drug_Gene,
  #             AUC_zscore and the covariate column (RPKM or CN).
  #   variable: "GE" (uses column RPKM) or "CN" (uses column CN).
  #
  # Returns the merge of the global `g2p_druggene_tally` with one row per
  # tested Drug_Gene, carrying Pearson_Corr_<variable>, Pearson_Pval_<variable>,
  # group_general_lineage_name, Pearson_<variable>_Abs and
  # Pearson_<variable>_Sign ("+", "-" or "0"). Returns NULL invisibly when
  # `variable` is not recognised or no Drug_Gene has at least 3 usable rows.
  value_col <- switch(variable, GE = "RPKM", CN = "CN")
  if (is.null(value_col)) {
    return(invisible(NULL))
  }

  # cor.test() works on complete (x, y) pairs, so both columns must be present
  # before the pairs are counted; otherwise a group can pass the n >= 3 check
  # and still fail inside cor.test().
  use_data <- filter(dataset, group_general_lineage_name == lin)
  use_data <- filter(use_data, !is.na(.data[[value_col]]), !is.na(AUC_zscore))

  # cor.test requires at least 3 non-NA data points
  use_data_filt <- use_data %>%
    group_by(Drug_Gene) %>%
    filter(n() >= 3)
  if (nrow(use_data_filt) == 0) {
    return(invisible(NULL))
  }

  # Run each test once and read both the estimate and the p-value from it.
  tests <- use_data_filt %>%
    summarize(fit = list(cor.test(x = .data[[value_col]], y = AUC_zscore, method = "pearson")))

  corr_col <- paste0("Pearson_Corr_", variable)
  pval_col <- paste0("Pearson_Pval_", variable)
  pearson_res <- data.frame(Drug_Gene = tests$Drug_Gene, stringsAsFactors = FALSE)
  pearson_res[[corr_col]] <- vapply(tests$fit, function(fit) unname(fit$estimate), numeric(1))
  pearson_res[[pval_col]] <- vapply(tests$fit, function(fit) fit$p.value, numeric(1))

  pearson_res <- merge(g2p_druggene_tally, pearson_res, all.y = TRUE)
  pearson_res$group_general_lineage_name <- lin
  pearson_res[[paste0("Pearson_", variable, "_Abs")]] <- abs(pearson_res[[corr_col]])
  # sign() is -1 / 0 / 1, so shifting by 2 indexes the matching symbol; a
  # missing correlation stays NA.
  pearson_res[[paste0("Pearson_", variable, "_Sign")]] <- c("-", "0", "+")[sign(pearson_res[[corr_col]]) + 2]
  return(pearson_res)
}
saveWidgetFix <- function (widget , file, ...) {
  # Save an htmlwidget to `file`, which may sit in any directory.
  # saveWidget() is called from inside the target directory with a bare file
  # name, and the caller's working directory is restored on exit (including
  # when saving fails). Extra arguments in `...` are forwarded to saveWidget().
  target_dir <- dirname(file)
  target_name <- basename(file)
  previous_wd <- getwd()
  on.exit(setwd(previous_wd))
  setwd(target_dir)
  saveWidget(widget, file = target_name, ...)
}
MutPlotsByLineage <- function(lineage_name) {
  # For one lineage, compare the mutation-association p-values (`p`) of
  # Drug_Gene pairs that are in G2P against those that are not, separately for
  # the CCLE, CTRP and GDSC datasets, and draw the comparison.
  #
  # Reads the global `dgl_mut_grid_summ` (columns used: Dataset,
  # group_general_lineage_name, InG2P, p, n, Drug_Gene).
  #
  # Side effects: writes an interactive plotly HTML file and a PNG under
  # ./plots/manuscript/ (both directories must already exist).
  #
  # Returns a 3-row data frame (one row per dataset) with the formatted test
  # label, t statistic, truncated degrees of freedom, p-value and lineage name.
  # Entries are NA for datasets where no t-test could be run.
  dataset_levels <- c("CCLE", "CTRP", "GDSC")
  use_data <- filter(dgl_mut_grid_summ, group_general_lineage_name == lineage_name)
  use_data$Dataset <- factor(use_data$Dataset, levels = dataset_levels)

  # Welch t-test of p ~ InG2P within one dataset. Group sizes are counted with
  # base R so the gating does not depend on the column name dplyr::count()
  # picks for its tally (it differs between dplyr versions when the input
  # already has an `n` column).
  ttest_in_g2p <- function(dataset_name) {
    rows <- filter(use_data, Dataset == dataset_name)
    groups <- as.character(rows$InG2P)
    result <- list(label = NA, statistic = NA_real_, parameter = NA_real_,
                   p.value = NA_real_, single_group = FALSE)
    if (length(unique(groups)) <= 1) {
      result$single_group <- TRUE
      return(result)
    }
    # t.test() needs exactly two groups with at least two observations each.
    sizes <- table(groups[!is.na(rows$p)])
    testable <- length(sizes) == 2 && "Yes" %in% names(sizes) && all(sizes >= 2)
    if (!testable) {
      return(result)
    }
    fit <- t.test(rows$p ~ rows$InG2P)
    result$statistic <- unname(fit$statistic)
    result$parameter <- unname(fit$parameter)
    result$p.value <- fit$p.value
    result$label <- paste0("t(", trunc(result$parameter), ") = ", round(result$statistic, 2),
                           ", p = ", round(result$p.value, 3))
    result
  }

  tests <- lapply(dataset_levels, ttest_in_g2p)
  numeric_field <- function(field) vapply(tests, function(x) x[[field]], numeric(1))

  # When none of the three datasets has two InG2P groups, fall back to a fixed
  # y position of 10 for the annotation text.
  ticker <- sum(vapply(tests, function(x) x$single_group, logical(1)))
  y_value <- if (ticker == 3) 10 else max(use_data$n) + max(use_data$n) * 0.1

  summ_text <- data.frame(Dataset = dataset_levels,
                          Test_Results_p = unlist(lapply(tests, function(x) x$label), use.names = FALSE),
                          t_statistic = round(numeric_field("statistic"), 3),
                          df = trunc(numeric_field("parameter")),
                          p_value = round(numeric_field("p.value"), 3),
                          group_general_lineage_name = lineage_name)

  plot_point <- ggplot(data = use_data) +
    facet_wrap(~ Dataset, drop = FALSE) +
    coord_cartesian(xlim = c(0, 1), ylim = c(0, y_value)) +
    scale_x_continuous(breaks = seq(0, 1, by = 0.1), labels = seq(0, 1, by = 0.1)) +
    scale_y_continuous(breaks = pretty_breaks()) +
    # `label` is not a geom_point aesthetic; it is carried along for the plotly tooltip
    geom_point(mapping = aes(label = Drug_Gene, color = InG2P, x = p, y = n), alpha = 0.5) +
    scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
    geom_text_repel(data = subset(use_data, InG2P == "Yes"), mapping = aes(x = p, y = n, label = Drug_Gene), size = 3, segment.size = 0.2, segment.color = "grey50") +
    geom_text(data = summ_text, mapping = aes(x = 0.5, y = y_value, label = Test_Results_p), nudge_y = 0.1) +
    labs(x = "Drug-gene association p-value between mutation groups", y = "Number of level A G2P associations") +
    theme(legend.position = "none", plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"), strip.background = element_blank(), strip.text.x = element_blank())

  plot_box <- ggplot(data = use_data) +
    facet_wrap(~ Dataset, drop = FALSE) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1), labels = seq(0, 1, by = 0.1)) +
    geom_violin(mapping = aes(color = InG2P, y = p, x = InG2P), alpha = 0.5) +
    geom_boxplot(mapping = aes(color = InG2P, y = p, x = InG2P), alpha = 0.75, width = 0.25) +
    scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
    # A plot holds one coordinate system: a separate coord_cartesian(ylim = ...)
    # is discarded once coord_flip() is added, so the limits go here instead.
    coord_flip(ylim = c(0, 1)) +
    labs(x = "In G2P?", title = toupper(lineage_name)) +
    theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm"))

  # Interactive version: box panel on top, scatter panel below
  widget <- subplot(plot_box, plot_point, nrows = 2, titleX = TRUE, titleY = TRUE) %>%
    layout(title = toupper(lineage_name), margin = list(t = 75, l = 75),
           xaxis5 = list(title = "Gene-drug association p-value between mutation groups"),
           yaxis = list(title = "Level A G2P\nassociation", domain = c(0.8, 1)),
           yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79)))

  # Static version: give both panels the same column widths so they line up
  grob_point <- ggplot_gtable(ggplot_build(plot_point))
  grob_box <- ggplot_gtable(ggplot_build(plot_box))
  p_maxWidth <- grid::unit.pmax(grob_point$widths, grob_box$widths)
  grob_point$widths <- as.list(p_maxWidth)
  grob_box$widths <- as.list(p_maxWidth)
  mut_plot <- cowplot::plot_grid(grob_box,
                                 grob_point,
                                 nrow = 2,
                                 rel_heights = c(1, 4))

  saveWidgetFix(widget = widget, file = paste0("./plots/manuscript/plotlys_mut_by_lineage_grid/", lineage_name, "_mut_plotly_grid.html"))
  ggsave(paste0("./plots/manuscript/pngs_mut_by_lineage_grid/", lineage_name, "_mut_plot_grid.png"), mut_plot, device = "png", dpi = 450, width = 12, height = 6, units = "in")
  return(summ_text)
}
PearsonPlotsByLineage <- function(lineage) {
  # For one lineage, compare absolute Pearson correlations (gene expression and
  # copy number vs. drug AUC) between Drug_Gene pairs that are in G2P and those
  # that are not, per dataset (CCLE, CTRP, GDSC), and draw the comparison.
  #
  # Reads the global `dgl_signif_pearson_g2p_grid` (columns used: Dataset,
  # group_general_lineage_name, InG2P, n, Drug_Gene, Pearson_{GE,CN}_Abs,
  # Pearson_{GE,CN}_Sign, Pearson_Pval_{GE,CN}).
  #
  # Side effects: prints progress messages, and for each of GE / CN that has
  # any non-missing correlation writes a plotly HTML file and a PNG under
  # ./plots/manuscript/ (the directories must already exist).
  #
  # Returns a 3-row data frame (one row per dataset) with the formatted test
  # label, t statistic, truncated degrees of freedom and p-value for GE and
  # for CN, plus the lineage name. Entries are NA where no t-test was run.
  print(paste0("Lineage: ", toupper(lineage)))
  dataset_levels <- c("CCLE", "CTRP", "GDSC")
  use_data <- filter(dgl_signif_pearson_g2p_grid, group_general_lineage_name == lineage)
  use_data$Dataset <- factor(use_data$Dataset, levels = dataset_levels)
  use_data$Pearson_GE_Label <- paste0(round(use_data$Pearson_Pval_GE, 3), "\nDrug_Gene: ", use_data$Drug_Gene)
  use_data$Pearson_CN_Label <- paste0(round(use_data$Pearson_Pval_CN, 3), "\nDrug_Gene: ", use_data$Drug_Gene)

  # Welch t-test of <value_col> ~ InG2P within one dataset, using only rows
  # where the value is present. Group sizes are counted with base R so the
  # gating does not depend on the column name dplyr::count() picks for its
  # tally (it differs between dplyr versions when the input has an `n` column).
  ttest_in_g2p <- function(dataset_name, value_col) {
    rows <- filter(use_data, Dataset == dataset_name)
    values <- rows[[value_col]]
    observed <- !is.na(values)
    values <- values[observed]
    groups <- as.character(rows$InG2P)[observed]
    result <- list(label = NA, statistic = NA_real_, parameter = NA_real_,
                   p.value = NA_real_, single_group = FALSE)
    if (length(unique(groups)) <= 1) {
      result$single_group <- TRUE
      return(result)
    }
    # t.test() needs exactly two groups with at least two observations each.
    sizes <- table(groups)
    testable <- length(sizes) == 2 && "Yes" %in% names(sizes) && all(sizes >= 2)
    if (!testable) {
      return(result)
    }
    fit <- t.test(values ~ groups)
    result$statistic <- unname(fit$statistic)
    result$parameter <- unname(fit$parameter)
    result$p.value <- fit$p.value
    result$label <- paste0("t(", trunc(result$parameter), ") = ", round(result$statistic, 2),
                           ", p = ", round(result$p.value, 3))
    result
  }

  ge_tests <- vector("list", length(dataset_levels))
  cn_tests <- vector("list", length(dataset_levels))
  for (i in seq_along(dataset_levels)) {
    ge_tests[[i]] <- ttest_in_g2p(dataset_levels[i], "Pearson_GE_Abs")
    cn_tests[[i]] <- ttest_in_g2p(dataset_levels[i], "Pearson_CN_Abs")
    print(paste0(dataset_levels[i], " testing: DONE"))
  }

  numeric_field <- function(tests, field) vapply(tests, function(x) x[[field]], numeric(1))
  label_field <- function(tests) unlist(lapply(tests, function(x) x$label), use.names = FALSE)
  n_single_group <- function(tests) sum(vapply(tests, function(x) x$single_group, logical(1)))

  # When none of the three datasets has two InG2P groups, fall back to a fixed
  # y position of 10 for the annotation text.
  y_value_ge <- if (n_single_group(ge_tests) == 3) 10 else max(use_data$n) + max(use_data$n) * 0.1
  y_value_cn <- if (n_single_group(cn_tests) == 3) 10 else max(use_data$n) + max(use_data$n) * 0.1

  summ_text <- data.frame(Dataset = dataset_levels,
                          Test_Results_GE = label_field(ge_tests),
                          t_statistic_GE = round(numeric_field(ge_tests, "statistic"), 3),
                          df_GE = trunc(numeric_field(ge_tests, "parameter")),
                          p_value_GE = round(numeric_field(ge_tests, "p.value"), 3),
                          Test_Results_CN = label_field(cn_tests),
                          t_statistic_CN = round(numeric_field(cn_tests, "statistic"), 3),
                          df_CN = trunc(numeric_field(cn_tests, "parameter")),
                          p_value_CN = round(numeric_field(cn_tests, "p.value"), 3),
                          group_general_lineage_name = lineage)

  # Shapes are matched to the sign by name so the mapping does not depend on
  # how the locale sorts "+" and "-"; a "0" sign gets no shape instead of
  # raising an "insufficient values" error.
  sign_shapes <- c("-" = 25, "+" = 24)

  ## Gene expression
  if(sum(!is.na(use_data$Pearson_GE_Abs)) > 0) {
    plot_ge_point <- ggplot(data = use_data) +
      facet_wrap(~ Dataset, drop = FALSE) +
      scale_y_continuous(breaks = pretty_breaks()) +
      # `label` is not a geom_point aesthetic; it is carried along for the plotly tooltip
      geom_point(mapping = aes(label = Pearson_GE_Label, color = InG2P, fill = InG2P, shape = Pearson_GE_Sign, x = Pearson_GE_Abs, y = n), alpha = 0.5) +
      scale_shape_manual(values = sign_shapes, breaks = c("-", "+"), labels = c("Negative", "Positive")) +
      scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
      scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
      geom_text_repel(data = subset(use_data, InG2P == "Yes" & n > 10), aes(label = Drug_Gene, x = Pearson_GE_Abs, y = n), size = 3, segment.size = 0.2, segment.color = "grey50") +
      geom_text(data = summ_text, mapping = aes(x = 0.5, y = y_value_ge, label = Test_Results_GE), nudge_y = 1) +
      labs(y = "Number of level A G2P associations", x = "Pearson correlation coefficient between drug-gene AUC and gene expression") +
      theme(legend.position = "none", plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"), strip.background = element_blank(), strip.text.x = element_blank())
    plot_ge_box <- ggplot(data = use_data) +
      facet_wrap(~ Dataset, drop = FALSE) +
      geom_violin(mapping = aes(color = InG2P, y = Pearson_GE_Abs, x = InG2P), alpha = 0.5) +
      geom_boxplot(mapping = aes(color = InG2P, y = Pearson_GE_Abs, x = InG2P), alpha = 0.5, width = 0.25) +
      scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
      coord_flip() +
      labs(x = "In G2P?", title = toupper(lineage)) +
      theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm"))
    plot_ge <- subplot(plot_ge_box, plot_ge_point, nrows = 2, titleX = TRUE, titleY = TRUE) %>%
      layout(title = lineage, margin = list(t = 75, l = 75),
             xaxis5 = list(title = "Pearson correlation coefficient between drug-gene AUC and gene expression"),
             yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
             yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79)))
    # Give both panels the same column widths so they line up in the PNG
    png_ge_point <- ggplot_gtable(ggplot_build(plot_ge_point))
    png_ge_box <- ggplot_gtable(ggplot_build(plot_ge_box))
    p_maxWidth <- grid::unit.pmax(png_ge_point$widths, png_ge_box$widths)
    png_ge_point$widths <- as.list(p_maxWidth)
    png_ge_box$widths <- as.list(p_maxWidth)
    png_ge <- cowplot::plot_grid(png_ge_box, png_ge_point, nrow = 2, rel_heights = c(1, 4))
    saveWidgetFix(widget = plot_ge, file = paste0("./plots/manuscript/plotlys_ge_by_lineage_grid/", lineage, "_ge_plotly_grid.html"))
    ggsave(paste0("./plots/manuscript/pngs_ge_by_lineage_grid/", lineage, "_ge_plot_grid.png"), png_ge, device = "png", dpi = 450, width = 12, height = 6, units = "in")
    print("Gene expression plots: DONE")
  } else {
    print("Gene expression plots: None")
  }

  ## Copy number
  if(sum(!is.na(use_data$Pearson_CN_Abs)) > 0) {
    plot_cn_point <- ggplot(data = use_data) +
      facet_wrap(~ Dataset, drop = FALSE) +
      scale_y_continuous(breaks = pretty_breaks()) +
      geom_point(mapping = aes(label = Pearson_CN_Label, color = InG2P, fill = InG2P, shape = Pearson_CN_Sign, x = Pearson_CN_Abs, y = n), alpha = 0.5, position = position_jitter(width = 0, height = 0.15)) +
      scale_shape_manual(values = sign_shapes, breaks = c("-", "+"), labels = c("Negative", "Positive")) +
      scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
      scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
      geom_text_repel(data = subset(use_data, InG2P == "Yes" & n > 10), aes(label = Drug_Gene, x = Pearson_CN_Abs, y = n), size = 3, segment.size = 0.2, segment.color = "grey50") +
      geom_text(data = summ_text, mapping = aes(x = 0.5, y = y_value_cn, label = Test_Results_CN), nudge_y = 1) +
      labs(y = "Number of level A G2P associations", x = "Pearson correlation coefficient between drug-gene AUC and copy number") +
      theme(legend.position = "none", plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"), strip.background = element_blank(), strip.text.x = element_blank())
    plot_cn_box <- ggplot(data = use_data) +
      facet_wrap(~ Dataset, drop = FALSE) +
      geom_violin(mapping = aes(color = InG2P, y = Pearson_CN_Abs, x = InG2P), alpha = 0.5) +
      geom_boxplot(mapping = aes(color = InG2P, y = Pearson_CN_Abs, x = InG2P), alpha = 0.5, width = 0.25) +
      scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
      coord_flip() +
      labs(x = "In G2P?", title = toupper(lineage)) +
      theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm"))
    plot_cn <- subplot(plot_cn_box, plot_cn_point, nrows = 2, titleX = TRUE, titleY = TRUE) %>%
      layout(title = lineage, margin = list(t = 75, l = 75),
             xaxis5 = list(title = "Pearson correlation coefficient between drug-gene AUC and copy number"),
             yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
             yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79)))
    # Give both panels the same column widths so they line up in the PNG
    png_cn_point <- ggplot_gtable(ggplot_build(plot_cn_point))
    png_cn_box <- ggplot_gtable(ggplot_build(plot_cn_box))
    p_maxWidth <- grid::unit.pmax(png_cn_point$widths, png_cn_box$widths)
    png_cn_point$widths <- as.list(p_maxWidth)
    png_cn_box$widths <- as.list(p_maxWidth)
    png_cn <- cowplot::plot_grid(png_cn_box, png_cn_point, nrow = 2, rel_heights = c(1, 4))
    saveWidgetFix(widget = plot_cn, file = paste0("./plots/manuscript/plotlys_cn_by_lineage_grid/", lineage, "_cn_plotly_grid.html"))
    ggsave(paste0("./plots/manuscript/pngs_cn_by_lineage_grid/", lineage, "_cn_plot_grid.png"), png_cn, device = "png", dpi = 450, width = 12, height = 6, units = "in")
    print("Copy number plots: DONE")
  } else {
    print("Copy number plots: None")
  }
  return(summ_text)
}
makeDrugBoxplots <- function(drug, dataset) {
  # Box plots of AUC by non-silent mutation status for one drug, with one facet
  # per gene and the per-gene significance code printed above the boxes.
  #
  # Args:
  #   drug:    value of the `Drug` column to plot.
  #   dataset: "CCLE", "CTRP" or "GDSC" (all upper or all lower case).
  #
  # Reads the globals <dataset>_data_ptmuts, <dataset>_signif_g2p_ByDrug and
  # g2p_genes. Returns a ggplot object, or the string
  # "Error: Invalid dataset." for an unrecognised `dataset`.
  inputs <- switch(dataset,
    CCLE = ,
    ccle = list(data = ccle_data_ptmuts, signif = ccle_signif_g2p_ByDrug),
    CTRP = ,
    ctrp = list(data = ctrp_data_ptmuts, signif = ctrp_signif_g2p_ByDrug),
    GDSC = ,
    gdsc = list(data = gdsc_data_ptmuts, signif = gdsc_signif_g2p_ByDrug)
  )
  if (is.null(inputs)) {
    return("Error: Invalid dataset.")
  }

  use_data <- filter(inputs$data, Drug == drug)
  # Fill colours keyed by mutation status, as scale_fill_manual() expects
  use_color <- as.character(use_data$Color_Nonsilent)
  names(use_color) <- use_data$Mutation_Status_Nonsilent

  sig <- inputs$signif[[drug]]
  label_text <- data.frame(p.signif = sig$p.signif, p.signif.adj = sig$p.signif.adj, Hugo_Symbol = sig$Hugo_Symbol)
  label_text$Hugo_Symbol <- factor(label_text$Hugo_Symbol, levels = g2p_genes)

  box_layer <- geom_boxplot(mapping = aes(fill = Mutation_Status_Nonsilent), position = position_dodge(0.85), outlier.shape = 3, outlier.size = 0.5)
  # x = 1.5 centres the code between the two boxes
  signif_layer <- geom_text(data = label_text, mapping = aes(x = 1.5, y = max(use_data$AUC), label = p.signif), nudge_y = 0.2)
  facet_theme <- theme(strip.text.x = element_text(size = 10, angle = 45), legend.position = "top", axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.title.x = element_blank())

  plot <- ggplot(data = use_data, aes(x = Mutation_Status_Nonsilent, y = AUC)) +
    facet_wrap(~ Hugo_Symbol, drop = FALSE, nrow = 1) +
    box_layer +
    scale_fill_manual(values = use_color) +
    guides(color = FALSE) +
    signif_layer +
    facet_theme +
    labs(fill = "Mutation Status", y = "AUC", x = "Mutation Status", title = paste0(toupper(dataset), ": ", drug))
  return(plot)
}
This comprehensive cancer cell line information was curated by both Daniel Charytonowicz and myself.
# Curated cell-line table; the first column of the file becomes the row names.
ccl_converter <- read.delim("./data_munging/cell_line_database_v1_20180911.tsv", header = TRUE, sep = "\t", row.names = 1)
names(ccl_converter)[names(ccl_converter) == "Broad_ID"] <- "DepMap_ID"
# Distinct (DOID, lineage name) pairs, used to attach lineage names elsewhere.
lineage_cols <- c("grouped_general_doid", "group_general_lineage_name")
lineage_converter <- unique(ccl_converter[, lineage_cols])

# DepMap cell-line annotations; empty fields are read as missing.
ccl_info <- read.csv("./data_munging/18Q4/DepMap-2018q4-celllines_20181115.csv", header = TRUE, na.strings = c("", NA))
# Strip backslashes from the disease names, then restore the apostrophe.
disease <- gsub("\\\\", "", ccl_info$Primary.Disease)
ccl_info$Primary.Disease <- gsub("Ewings", "Ewing's", disease)

# From figshare
crispr_meta <- read.csv("./data_munging/18Q4/sample_info_18Q4_crispr_20181115.csv", header = TRUE, na.strings = c("", NA))
# NOTE(review): column 7 is renamed by position; presumably the cell-line name column - confirm against the file.
names(crispr_meta)[7] <- "CCLE_Name"
For mutation calling, get paired gene name and cell line fields in a data frame. For this analysis, we don’t care about how many mutations there are per gene or what type of mutations there are, so I didn’t save more information. I took unique gene-cell line combinations since we only cared about mutation presence/absence. Add a Mutation_Status column denoting all entries in MAF files as mutations present in the associated cell lines.
Mutation calls (coding region, germline filtered):
# Mutation calls; the first column of the file becomes the row names.
maf_raw <- read.csv("./data_munging/18Q4/depmap_18Q4_mutation_calls_20181114.csv.gz", header = TRUE, row.names = 1)
names(maf_raw)[names(maf_raw) == "Tumor_Sample_Barcode"] <- "CCLE_Name"

# Keep only the columns used downstream.
maf_cols <- c("Hugo_Symbol", "CCLE_Name", "DepMap_ID", "Variant_Classification", "Reference_Allele", "Tumor_Seq_Allele1", "Genome_Change")
maf_df <- maf_raw[, maf_cols, drop = FALSE]

# Replace both allele columns with allele lengths; a "-" reference allele counts as length 0.
# NOTE(review): a "-" in Tumor_Seq_Allele1 is counted as length 1, unlike the reference allele - confirm this is intended.
ref_allele <- as.character(maf_df$Reference_Allele)
maf_df$Reference_Allele <- ifelse(ref_allele == "-", 0, nchar(ref_allele))
maf_df$Tumor_Seq_Allele1 <- nchar(as.character(maf_df$Tumor_Seq_Allele1))

# Drop silent variants and flag every remaining call as a mutation.
maf_df <- maf_df[maf_df$Variant_Classification != "Silent", ]
maf_df$Mutation_Status_Nonsilent <- "Mutant"
saveRDS(maf_df, "./data_munging/rds/maf_df_18Q4.rds", compress = "xz")
DepMap WES CN Data:
# Gene-level copy number, one row per cell line and one column per gene.
cn <- read.csv("./data_munging/18Q4/public_18Q4_gene_cn_20181115.csv.gz", header = TRUE, check.names = FALSE)
# Column headers look like "Hugo_Symbol (Entrez ID)"; keep the text before the first space.
names(cn) <- gsub(" .*", "", names(cn))
# Reshape to long format: one row per (DepMap_ID, Hugo_Symbol) with its CN value.
gene_cols <- colnames(cn)[2:ncol(cn)]
cn_melt <- melt(data = cn, id.vars = "DepMap_ID", measure.vars = gene_cols, variable.name = "Hugo_Symbol", value.name = "CN")
saveRDS(cn_melt, "./data_munging/rds/cn_melt_18Q4.rds", compress = "xz")
18Q3, Gene expression data (Reads Per Kilobase of transcript, per Million mapped reads, RPKM).
# 18Q3 RPKM matrix in GCT format; the first two lines of the file are skipped.
ge_18Q3 <- read.delim("./../htscreens_giant_files/CCLE_DepMap_18q3_RNAseq_RPKM_20180718.gct.gz", header = TRUE, sep = "\t", skip = 2, check.names = FALSE)
# Drop the Name column; the column that is then first holds the gene symbol.
ge_18Q3$Name <- NULL
names(ge_18Q3)[1] <- "Hugo_Symbol"
# Reshape to long format: one row per (gene, sample column).
sample_cols <- colnames(ge_18Q3)[2:ncol(ge_18Q3)]
ge_melt_18Q3 <- melt(data = ge_18Q3, id.vars = "Hugo_Symbol", measure.vars = sample_cols, value.name = "RPKM")
## Sample headers are split at the space into cell-line name and Broad ID
sample_ids <- colsplit(ge_melt_18Q3$variable, pattern = " ", names = c("CCLE_Name", "DepMap_ID"))
ge_melt_18Q3 <- cbind(Hugo_Symbol = ge_melt_18Q3$Hugo_Symbol, sample_ids, RPKM = ge_melt_18Q3$RPKM)
## Remove parentheses around Broad IDs
ge_melt_18Q3$DepMap_ID <- gsub("\\(|\\)", "", ge_melt_18Q3$DepMap_ID)
saveRDS(ge_melt_18Q3, "./../htscreens_giant_files/ge_melt_18Q3.rds", compress = "xz")
18Q4, CCLE RNAseq gene expression data (log2(TPM+1)):
# 18Q4 expression matrix, one row per cell line and one column per gene.
ge_18Q4 <- read.delim("./../htscreens_giant_files/CCLE_depMap_18Q4_TPM_v2_20181115.csv.gz", header = TRUE, sep = ",", check.names = FALSE)
# Remove Entrez gene IDs from colnames
# Format: Hugo_Symbol (Entrez ID)
colnames(ge_18Q4) <- gsub(" .*", "", colnames(ge_18Q4))
colnames(ge_18Q4)[1] <- "DepMap_ID"
# Melt to long format: one row per (DepMap_ID, Hugo_Symbol) with its TPM value
ge_melt_18Q4_all <- melt(ge_18Q4, id.vars = "DepMap_ID", measure.vars = colnames(ge_18Q4)[2:ncol(ge_18Q4)], variable.name = "Hugo_Symbol", value.name = "TPM")
# De-duplicate the melted table. The input must be the object created on the
# line above; `ge_melt_18Q4` does not exist before this assignment.
ge_melt_18Q4 <- unique(ge_melt_18Q4_all)
saveRDS(ge_melt_18Q4, "./data_munging/rds/ge_melt_18Q4.rds", compress = "xz")
# Level A G2P associations (tab-separated despite the .csv extension).
g2p <- read.delim("./data_munging/g2p_full_level_A_dataset_NEW_DEC_2_2018_withGroupedLineages.csv", header = TRUE, sep = "\t")
# Drop the first two columns, then attach lineage names by DOID.
g2p[, 1:2] <- NULL
g2p <- merge(g2p, lineage_converter, by = "grouped_general_doid", all.x = TRUE)
g2p$Drug <- tolower(g2p$Drug)

# -1 and "None" are treated as missing-value placeholders.
g2p[g2p == -1] <- NA
g2p$Ref[g2p$Ref == "None"] <- NA
g2p$Alt[g2p$Alt == "None"] <- NA

# Composite keys; NA whenever any component is missing.
no_drug_gene <- is.na(g2p$Drug) | is.na(g2p$Gene)
no_lineage <- is.na(g2p$group_general_lineage_name)
g2p$Drug_Gene <- ifelse(no_drug_gene, NA, paste0(g2p$Drug, "_", g2p$Gene))
g2p$Drug_Gene_Lin <- ifelse(no_drug_gene | no_lineage, NA, paste0(g2p$Drug, "_", g2p$Gene, "_", g2p$group_general_lineage_name))

# Genomic position string; any pasted-in "NA" component makes the whole position NA.
position <- paste0("g.chr", g2p$Chromosome, ":", g2p$End, g2p$Ref, ">", g2p$Alt)
g2p$position <- ifelse(grepl("NA", position), NA, position)

# Keep records whose DrugID starts with "CID".
g2p_filt <- g2p[grepl("^CID", g2p$DrugID), , drop = FALSE]
g2p_dgls <- unique(g2p_filt$Drug_Gene_Lin[!is.na(g2p_filt$Drug_Gene_Lin)])
g2p_druggenes <- unique(g2p_filt$Drug_Gene[!is.na(g2p_filt$Drug_Gene)])

# Split each drug_gene_lineage key once and read its three fields by position.
dgl_parts <- strsplit(g2p_dgls, "_", fixed = TRUE)
g2p_drugs <- unique(sapply(dgl_parts, function(x) x[1]))
g2p_genes <- unique(sapply(dgl_parts, function(x) x[2]))
g2p_lins <- unique(sapply(dgl_parts, function(x) x[3]))
g2p_druggenes
## [1] "enasidenib_IDH2" "bosutinib monohydrate_ABL1"
## [3] "daunorubicin_NPM1" "ponatinib_ABL1"
## [5] "arsenic_PML" "imatinib_ABL1"
## [7] "dasatinib_ABL1" "nilotinib_ABL1"
## [9] "midostaurin_FLT3" "imatinib_PDGFRA"
## [11] "daunorubicin_DNMT3A" "tretinoin_PML"
## [13] "ceritinib_ALK" "crizotinib_ALK"
## [15] "gefitinib_EGFR" "afatinib_EGFR"
## [17] "afatinib_ERBB2" "erlotinib_EGFR"
## [19] "afatinib_KRAS" "icotinib_KRAS"
## [21] "erlotinib_KRAS" "crizotinib_MET"
## [23] "gefitinib_KRAS" "alectinib_ROS1"
## [25] "azd_EGFR" "alectinib_ALK"
## [27] "cabozantinib_RET" "brigatinib_ALK"
## [29] "osimertinib_EGFR" "trametinib_BRAF"
## [31] "afatinib_ERBB3" "ceritinib_ROS1"
## [33] "crizotinib_ROS1" "azd_KRAS"
## [35] "erlotinib_MET" "gefitinib_MET"
## [37] "vemurafenib_BRAF" "dabrafenib_BRAF"
## [39] "ap26113_ALK" "selumetinib_KRAS"
## [41] "lapatinib_ERBB2" "docetaxel_ERBB2"
## [43] "lapatinib_KRAS" "cyclophosphamide_ERBB2"
## [45] "neratinib_ERBB2" "palbociclib_ERBB2"
## [47] "anastrozole_ESR1" "fulvestrant_ESR1"
## [49] "letrozole_ESR1" "everolimus_ERBB2"
## [51] "fulvestrant_ERBB2" "anastrozole_CYP19A1"
## [53] "letrozole_ERBB2" "everolimus_AKT1"
## [55] "sunitinib_KIT" "purine_TPMT"
## [57] "capecitabine_DPYD" "ruxolitinib_JAK2"
## [59] "guanine_TPMT" "temozolomide_MGMT"
## [61] "tegafur_DPYD" "pralidoxime mesylate_KIT"
## [63] "everolimus_TSC1" "dabrafenib_G6PD"
## [65] "nilotinib_UGT1A1" "irinotecan_UGT1A1"
## [67] "imatinib_PDGFRB" "belinostat_UGT1A1"
## [69] "imatinib_KIT" "regorafenib_PDGFRA"
## [71] "2-fluoropyrimidine_DPYD" "everolimus_TSC2"
## [73] "fluorouracil_DPYD" "pazopanib_UGT1A1"
## [75] "cobimetinib_BRAF" "regorafenib_KIT"
## [77] "imatinib_PDGFB" "sunitinib_PDGFRA"
## [79] "olaparib_BRCA2" "ibrutinib_MYD88"
## [81] "olaparib_BRCA1" "nafomine malate_KIT"
## [83] "vandetanib_RET" "rucaparib_BRCA2"
## [85] "rucaparib_BRCA1" "niraparib_BRCA1"
## [87] "niraparib_BRCA2" "vismodegib_PTCH1"
# Print the distinct drug names parsed from the drug_gene_lineage keys
g2p_drugs
## [1] "enasidenib" "bosutinib monohydrate"
## [3] "daunorubicin" "ponatinib"
## [5] "arsenic" "imatinib"
## [7] "dasatinib" "nilotinib"
## [9] "midostaurin" "tretinoin"
## [11] "ceritinib" "crizotinib"
## [13] "gefitinib" "afatinib"
## [15] "erlotinib" "icotinib"
## [17] "alectinib" "azd"
## [19] "cabozantinib" "brigatinib"
## [21] "osimertinib" "trametinib"
## [23] "vemurafenib" "dabrafenib"
## [25] "ap26113" "selumetinib"
## [27] "lapatinib" "docetaxel"
## [29] "cyclophosphamide" "neratinib"
## [31] "palbociclib" "anastrozole"
## [33] "fulvestrant" "letrozole"
## [35] "everolimus" "sunitinib"
## [37] "purine" "capecitabine"
## [39] "ruxolitinib" "guanine"
## [41] "temozolomide" "tegafur"
## [43] "pralidoxime mesylate" "irinotecan"
## [45] "belinostat" "regorafenib"
## [47] "2-fluoropyrimidine" "fluorouracil"
## [49] "pazopanib" "cobimetinib"
## [51] "olaparib" "ibrutinib"
## [53] "nafomine malate" "vandetanib"
## [55] "rucaparib" "niraparib"
## [57] "vismodegib"
# Print the distinct gene symbols parsed from the drug_gene_lineage keys
g2p_genes
## [1] "IDH2" "ABL1" "NPM1" "PML" "FLT3" "PDGFRA" "DNMT3A"
## [8] "ALK" "EGFR" "ERBB2" "KRAS" "MET" "ROS1" "RET"
## [15] "BRAF" "ERBB3" "ESR1" "CYP19A1" "AKT1" "KIT" "TPMT"
## [22] "DPYD" "JAK2" "MGMT" "TSC1" "G6PD" "UGT1A1" "PDGFRB"
## [29] "TSC2" "PDGFB" "BRCA2" "MYD88" "BRCA1" "PTCH1"
# Print the distinct lineage names parsed from the drug_gene_lineage keys
g2p_lins
## [1] "leukemia" "lung cancer" "breast cancer" "cancer"
## [5] "thyroid cancer" "ovarian cancer" "skin cancer"
# Gene set and display order are taken from the pink heatmap in Daniel's
# analyses; this replaces the g2p_genes vector derived from the G2P keys.
g2p_genes <- c(
  "EGFR", "KRAS", "ERBB2", "ERBB3", "MET", "ALK", "ROS1", "PDGFRA",
  "ABL1", "KIT", "PDGFRB", "PDGFB", "BRAF", "G6PD", "RET", "BRCA1",
  "BRCA2", "UGT1A1", "TSC1", "TSC2", "AKT1", "IDH2", "DPYD", "ESR1",
  "CYP19A1", "TPMT", "FLT3", "NPM1", "DNMT3A", "PML", "PTCH1", "JAK2",
  "MYD88", "MGMT"
)
# levels(g2p_filt$Phenotype.Description)
unique(g2p_filt[["group_general_lineage_name"]])
## [1] leukemia lung cancer breast cancer cancer
## [5] thyroid cancer ovarian cancer skin cancer
## 27 Levels: biliary tract cancer bone cancer breast cancer ... vaginal cancer
# Print every lineage level defined on the factor, including levels that no
# longer occur in the filtered table
levels(g2p_filt$group_general_lineage_name)
## [1] "biliary tract cancer"
## [2] "bone cancer"
## [3] "breast cancer"
## [4] "cancer"
## [5] "central nervous system cancer"
## [6] "cervical cancer"
## [7] "colorectal cancer"
## [8] "esophageal cancer"
## [9] "head and neck cancer"
## [10] "kidney cancer"
## [11] "leukemia"
## [12] "liver cancer"
## [13] "lung cancer"
## [14] "lymphoma"
## [15] "malignant mesothelioma"
## [16] "multiple myeloma"
## [17] "ovarian cancer"
## [18] "pancreatic cancer"
## [19] "peripheral nervous system neoplasm"
## [20] "prostate cancer"
## [21] "rhabdomyosarcoma"
## [22] "skin cancer"
## [23] "stomach cancer"
## [24] "thyroid cancer"
## [25] "urinary bladder cancer"
## [26] "uterine cancer"
## [27] "vaginal cancer"
# Associations whose DrugID does not start with "CID"
g2p_nonCID <- subset(g2p, !grepl("^CID", g2p$DrugID))
# Row count per Drug/DrugID pair and its share of all non-CID rows.
# The denominator is taken from the data (previously hard-coded as 402), and
# the percentage is added with mutate() because it is a per-row value rather
# than a per-group summary.
summ <- g2p_nonCID %>%
  group_by(Drug, DrugID) %>%
  tally() %>%
  ungroup() %>%
  mutate(percent = formatC(n / nrow(g2p_nonCID) * 100, digits = 6, format = "f"))
summ
## # A tibble: 11 x 4
## Drug DrugID n percent
## <chr> <fct> <int> <chr>
## 1 cetuximab CHEMBL1201577 74 18.407960
## 2 cisplatin CHEMBL2068237 3 0.746269
## 3 egfr SID160769799 1 0.248756
## 4 ipilimumab CHEMBL1789844 2 0.497512
## 5 mab CHEMBL2109423 16 3.980100
## 6 nivolumab CHEMBL2108738 4 0.995025
## 7 panitumumab CHEMBL1201827 75 18.656716
## 8 pembrolizumab CHEMBL3137343 3 0.746269
## 9 pertuzumab CHEMBL2007641 1 0.248756
## 10 trastuzumab CHEMBL1201585 9 2.238806
## 11 <NA> <NA> 214 53.233831
# Load the melted copy-number and gene-expression tables
cn_melt <- readRDS("./data_munging/rds/cn_melt_18Q4.rds")
ge_melt_18Q3 <- readRDS("./../htscreens_giant_files/ge_melt_18Q3.rds")
# ge_melt_18Q4 <- readRDS("./data_munging/rds/ge_melt_18Q4.rds")
# maf_filt_dan <- filter(maf_df, Hugo_Symbol %in% g2p_genes)
# write.table(maf_filt_dan, file = "~/Desktop/maf_filt_dan_18Q4_2018112.csv", quote = FALSE, sep = ",", row.names = FALSE)
# Load the mutation (MAF) table
maf_df <- readRDS("./data_munging/rds/maf_df_18Q4.rds")
# Number of MAF rows per G2P gene / cell line pair, largest counts first
test <- maf_df %>%
  filter(Hugo_Symbol %in% g2p_genes) %>%
  count(Hugo_Symbol, CCLE_Name)
test <- test[order(test[["n"]], decreasing = TRUE), ]
2419/2716 gene-cell line combinations have more than one mutation in the MAF file. This is corrected in the point mutation filtering done below.
# Drug metadata: headerless two-column table (identifier, drug name);
# drug names are lower-cased
drug_meta <- setNames(
  read.delim("./data_munging/drug_id_drug_name_table_10_12_2018.tsv", sep = "\t", header = FALSE),
  c("CID", "Drug")
)
drug_meta[["Drug"]] <- tolower(drug_meta[["Drug"]])
# Restrict the mutation, expression and copy-number tables to the G2P genes
maf_df_g2p <- maf_df %>% filter(Hugo_Symbol %in% g2p_genes)
ge_filt_18Q3 <- ge_melt_18Q3 %>% filter(Hugo_Symbol %in% g2p_genes)
# ge_filt_18Q4 <- filter(ge_melt_18Q4, Hugo_Symbol %in% g2p_genes)
cn_filt <- cn_melt %>% filter(Hugo_Symbol %in% g2p_genes)
# CCLE
# Read the response matrix and attach drug names through the CID key
ccle <- merge(
  drug_meta,
  read.delim("./data_munging/data_drug_auc_ccle_10_12_2018.csv", sep = "\t", header = TRUE, row.names = 1, check.names = FALSE),
  by = "CID"
)
# Melt every column from the third onward into accession_id / ActArea pairs
ccle_melt <- melt(ccle, id.vars = c("CID", "Drug"), measure.vars = colnames(ccle)[3:ncol(ccle)], variable.name = "accession_id", value.name = "ActArea")
# Cross every screened cell line with every G2P gene
ccle_grid <- expand.grid(accession_id = unique(ccle_melt$accession_id), Hugo_Symbol = g2p_genes)
# Attach drug response, cell line identifiers and mutation calls
ccle_data <- ccle_grid %>%
  merge(ccle_melt, by = "accession_id", all.x = TRUE) %>%
  merge(ccl_converter, by = "accession_id", all.x = TRUE) %>%
  merge(maf_df_g2p, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)
# Gene / cell line pairs without a MAF row are treated as wildtype
ccle_data$Mutation_Status_Nonsilent <- ifelse(is.na(ccle_data$Mutation_Status_Nonsilent), "Wildtype", ccle_data$Mutation_Status_Nonsilent)
# Attach gene expression and copy number
ccle_data <- ccle_data %>%
  merge(ge_filt_18Q3, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE) %>%
  # merge(ge_filt_18Q4, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE) %>%
  merge(cn_filt, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE)
# CCLE reports activity area, i.e. area OVER the curve. Assuming a 0-8 scale,
# rescale to [0, 1] and convert to AUC as 1 - AOC.
ccle_data <- ccle_data %>%
  mutate(
    Hugo_Symbol = factor(Hugo_Symbol),
    ActArea = as.numeric(ActArea),
    AUC = 1 - (ActArea / 8)
  ) %>%
  filter(!is.na(AUC)) %>%
  mutate(
    # z-score over all remaining rows of the merged table
    AUC_zscore = (AUC - mean(AUC)) / sd(AUC),
    Color_Nonsilent = factor(ifelse(Mutation_Status_Nonsilent == "Wildtype", "palegreen3", "springgreen4")),
    Drug_Gene = paste0(Drug, "_", Hugo_Symbol),
    Drug_Gene_Lin = paste0(Drug, "_", Hugo_Symbol, "_", group_general_lineage_name)
  )
saveRDS(ccle_data, "./data_munging/rds/ccle_data_18Q4_g2pgenes.rds", compress = "xz")
# CTRP
# Read the response matrix and attach drug names through the CID key
ctrp <- merge(
  drug_meta,
  read.delim("./data_munging/data_drug_auc_ctrp_10_12_2018.csv", sep = "\t", header = TRUE, row.names = 1, check.names = FALSE),
  by = "CID"
)
# Melt every column from the third onward into accession_id / AUC pairs
ctrp_melt <- melt(ctrp, id.vars = c("CID", "Drug"), measure.vars = colnames(ctrp)[3:ncol(ctrp)], variable.name = "accession_id", value.name = "AUC")
# Cross every screened cell line with every G2P gene
ctrp_grid <- expand.grid(accession_id = unique(ctrp_melt$accession_id), Hugo_Symbol = g2p_genes)
# Attach drug response, cell line identifiers and mutation calls
ctrp_data <- ctrp_grid %>%
  merge(ctrp_melt, by = "accession_id", all.x = TRUE) %>%
  merge(ccl_converter, by = "accession_id", all.x = TRUE) %>%
  merge(maf_df_g2p, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)
# Gene / cell line pairs without a MAF row are treated as wildtype
ctrp_data$Mutation_Status_Nonsilent <- ifelse(is.na(ctrp_data$Mutation_Status_Nonsilent), "Wildtype", ctrp_data$Mutation_Status_Nonsilent)
# Attach gene expression and copy number
ctrp_data <- ctrp_data %>%
  merge(ge_filt_18Q3, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE) %>%
  # merge(ge_filt_18Q4, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE) %>%
  merge(cn_filt, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE)
# Drop rows without a response value, then derive z-score, colour and keys
ctrp_data <- ctrp_data %>%
  mutate(
    Hugo_Symbol = factor(Hugo_Symbol),
    AUC = as.numeric(AUC)
  ) %>%
  filter(!is.na(AUC)) %>%
  mutate(
    # z-score over all remaining rows of the merged table
    AUC_zscore = (AUC - mean(AUC)) / sd(AUC),
    Color_Nonsilent = factor(ifelse(Mutation_Status_Nonsilent == "Wildtype", "slategray3", "steelblue4")),
    Drug_Gene = paste0(Drug, "_", Hugo_Symbol),
    Drug_Gene_Lin = paste0(Drug, "_", Hugo_Symbol, "_", group_general_lineage_name)
  )
saveRDS(ctrp_data, "./data_munging/rds/ctrp_data_18Q4_g2pgenes.rds", compress = "xz")
# GDSC
# Read the response matrix and attach drug names through the CID key
gdsc <- merge(
  drug_meta,
  read.delim("./data_munging/data_drug_auc_gdsc_10_12_2018.csv", sep = "\t", header = TRUE, row.names = 1, check.names = FALSE),
  by = "CID"
)
# Melt every column from the third onward into accession_id / AUC pairs
gdsc_melt <- melt(gdsc, id.vars = c("CID", "Drug"), measure.vars = colnames(gdsc)[3:ncol(gdsc)], variable.name = "accession_id", value.name = "AUC")
# Cross every screened cell line with every G2P gene
gdsc_grid <- expand.grid(accession_id = unique(gdsc_melt$accession_id), Hugo_Symbol = g2p_genes)
# Attach drug response, cell line identifiers and mutation calls
gdsc_data <- gdsc_grid %>%
  merge(gdsc_melt, by = "accession_id", all.x = TRUE) %>%
  merge(ccl_converter, by = "accession_id", all.x = TRUE) %>%
  merge(maf_df_g2p, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)
# Gene / cell line pairs without a MAF row are treated as wildtype
gdsc_data$Mutation_Status_Nonsilent <- ifelse(is.na(gdsc_data$Mutation_Status_Nonsilent), "Wildtype", gdsc_data$Mutation_Status_Nonsilent)
# Attach gene expression and copy number
gdsc_data <- gdsc_data %>%
  merge(ge_filt_18Q3, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE) %>%
  # merge(ge_filt_18Q4, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE) %>%
  merge(cn_filt, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE)
# Drop rows without a response value, then derive z-score, colour and keys
gdsc_data <- gdsc_data %>%
  mutate(
    Hugo_Symbol = factor(Hugo_Symbol),
    AUC = as.numeric(AUC)
  ) %>%
  filter(!is.na(AUC)) %>%
  mutate(
    # z-score over all remaining rows of the merged table
    AUC_zscore = (AUC - mean(AUC)) / sd(AUC),
    Color_Nonsilent = factor(ifelse(Mutation_Status_Nonsilent == "Wildtype", "paleturquoise3", "turquoise4")),
    Drug_Gene = paste0(Drug, "_", Hugo_Symbol),
    Drug_Gene_Lin = paste0(Drug, "_", Hugo_Symbol, "_", group_general_lineage_name)
  )
saveRDS(gdsc_data, "./data_munging/rds/gdsc_data_18Q4_g2pgenes.rds", compress = "xz")
ccle_data <- readRDS("./data_munging/rds/ccle_data_18Q4_g2pgenes.rds")
# Keep rows with no MAF entry (Reference_Allele is NA), rows with
# Reference_Allele == 1, and rows with Reference_Allele == 0 whose
# Tumor_Seq_Allele1 == 1.
# NOTE(review): the alleles are compared against 0/1, so these columns are
# presumably allele lengths/counts rather than bases - confirm against the
# MAF preprocessing.
ccle_data_ptmuts <- ccle_data %>%
  filter(
    Reference_Allele == 1 |
      is.na(Reference_Allele) |
      (Reference_Allele == 0 & Tumor_Seq_Allele1 == 1)
  ) %>%
  # Drop the per-mutation columns that lead to duplicate mutation entries
  dplyr::select(-Variant_Classification, -Reference_Allele, -Tumor_Seq_Allele1, -Genome_Change) %>%
  mutate(
    Dataset = "CCLE",
    Hugo_Symbol = ordered(Hugo_Symbol, levels = g2p_genes)
  ) %>%
  unique()
saveRDS(ccle_data_ptmuts, "./data_munging/rds/ccle_data_18Q4_g2pgenes_ptmuts.rds", compress = "xz")
ctrp_data <- readRDS("./data_munging/rds/ctrp_data_18Q4_g2pgenes.rds")
# Keep rows with no MAF entry (Reference_Allele is NA), rows with
# Reference_Allele == 1, and rows with Reference_Allele == 0 whose
# Tumor_Seq_Allele1 == 1.
# NOTE(review): the alleles are compared against 0/1, so these columns are
# presumably allele lengths/counts rather than bases - confirm against the
# MAF preprocessing.
ctrp_data_ptmuts <- ctrp_data %>%
  filter(
    Reference_Allele == 1 |
      is.na(Reference_Allele) |
      (Reference_Allele == 0 & Tumor_Seq_Allele1 == 1)
  ) %>%
  # Drop the per-mutation columns that lead to duplicate mutation entries
  dplyr::select(-Variant_Classification, -Reference_Allele, -Tumor_Seq_Allele1, -Genome_Change) %>%
  mutate(
    Dataset = "CTRP",
    Hugo_Symbol = ordered(Hugo_Symbol, levels = g2p_genes)
  ) %>%
  unique()
saveRDS(ctrp_data_ptmuts, "./data_munging/rds/ctrp_data_18Q4_g2pgenes_ptmuts.rds", compress = "xz")
gdsc_data <- readRDS("./data_munging/rds/gdsc_data_18Q4_g2pgenes.rds")
# Keep rows with no MAF entry (Reference_Allele is NA), rows with
# Reference_Allele == 1, and rows with Reference_Allele == 0 whose
# Tumor_Seq_Allele1 == 1.
# NOTE(review): the alleles are compared against 0/1, so these columns are
# presumably allele lengths/counts rather than bases - confirm against the
# MAF preprocessing.
gdsc_data_ptmuts <- gdsc_data %>%
  filter(
    Reference_Allele == 1 |
      is.na(Reference_Allele) |
      (Reference_Allele == 0 & Tumor_Seq_Allele1 == 1)
  ) %>%
  # Drop the per-mutation columns that lead to duplicate mutation entries
  dplyr::select(-Variant_Classification, -Reference_Allele, -Tumor_Seq_Allele1, -Genome_Change) %>%
  mutate(
    Dataset = "GDSC",
    Hugo_Symbol = ordered(Hugo_Symbol, levels = g2p_genes)
  ) %>%
  unique()
saveRDS(gdsc_data_ptmuts, "./data_munging/rds/gdsc_data_18Q4_g2pgenes_ptmuts.rds", compress = "xz")
# Reload the cached drug-screen tables: the full merged table for each of
# CCLE, CTRP and GDSC ...
ccle_data <- readRDS("./data_munging/rds/ccle_data_18Q4_g2pgenes.rds")
ctrp_data <- readRDS("./data_munging/rds/ctrp_data_18Q4_g2pgenes.rds")
gdsc_data <- readRDS("./data_munging/rds/gdsc_data_18Q4_g2pgenes.rds")
# ... and the point-mutation-filtered, deduplicated version of each
ccle_data_ptmuts <- readRDS("./data_munging/rds/ccle_data_18Q4_g2pgenes_ptmuts.rds")
ctrp_data_ptmuts <- readRDS("./data_munging/rds/ctrp_data_18Q4_g2pgenes_ptmuts.rds")
gdsc_data_ptmuts <- readRDS("./data_munging/rds/gdsc_data_18Q4_g2pgenes_ptmuts.rds")
# Filter drug screen datasets for G2P drug-gene associations.
# In every subset, mutation status becomes a factor with "Mutant" as the
# first level.
ccle_data_g2p <- ccle_data_ptmuts %>%
  filter(Drug_Gene %in% g2p_druggenes) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
ctrp_data_g2p <- ctrp_data_ptmuts %>%
  filter(Drug_Gene %in% g2p_druggenes) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
gdsc_data_g2p <- gdsc_data_ptmuts %>%
  filter(Drug_Gene %in% g2p_druggenes) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
# Filter drug screen datasets for G2P drug-gene-lineage associations
ccle_data_g2p_dgl <- ccle_data_ptmuts %>%
  filter(Drug_Gene_Lin %in% g2p_dgls) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
ctrp_data_g2p_dgl <- ctrp_data_ptmuts %>%
  filter(Drug_Gene_Lin %in% g2p_dgls) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
gdsc_data_g2p_dgl <- gdsc_data_ptmuts %>%
  filter(Drug_Gene_Lin %in% g2p_dgls) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
# Filter drug screen datasets for G2P drugs and genes (every G2P drug against
# every G2P gene present in the table)
ccle_data_g2p_grid <- ccle_data_ptmuts %>%
  filter(Drug %in% g2p_drugs) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
ctrp_data_g2p_grid <- ctrp_data_ptmuts %>%
  filter(Drug %in% g2p_drugs) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
gdsc_data_g2p_grid <- gdsc_data_ptmuts %>%
  filter(Drug %in% g2p_drugs) %>%
  mutate(Mutation_Status_Nonsilent = factor(Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype")))
# Testing
# Sanity check: row counts per gene and mutation status in each
# point-mutation-filtered dataset. The results are not used elsewhere in the
# visible part of this file.
test_ccle <- ccle_data_ptmuts %>% group_by(Hugo_Symbol, Mutation_Status_Nonsilent) %>% tally()
test_ctrp <- ctrp_data_ptmuts %>% group_by(Hugo_Symbol, Mutation_Status_Nonsilent) %>% tally()
test_gdsc <- gdsc_data_ptmuts %>% group_by(Hugo_Symbol, Mutation_Status_Nonsilent) %>% tally()
# drug_summ <- data.frame(Drug = g2p_drugs, Dataset_Count = 0)
How many datasets screen each G2P drug?
# For every G2P drug, count how many of the three drug-screen datasets
# (CCLE, CTRP, GDSC) still contain it after filtering to G2P drug-gene
# associations. Each %in% yields one logical per drug; their sum is 0-3.
# Built in one vectorized step instead of growing a data frame with rbind()
# inside a loop; Dataset_Count is stored as integer, matching the declared
# column type.
drug_summ <- data.frame(
  Drug = as.character(g2p_drugs),
  Dataset_Count = as.integer(
    (g2p_drugs %in% as.character(unique(ccle_data_g2p$Drug))) +
      (g2p_drugs %in% as.character(unique(ctrp_data_g2p$Drug))) +
      (g2p_drugs %in% as.character(unique(gdsc_data_g2p$Drug)))
  ),
  stringsAsFactors = FALSE
)
# Gene dependency matrix (comma-separated, gzip-compressed)
dep <- read.csv("./data_munging/18Q4/gene_dependency_18Q4_20181115.csv.gz", check.names = FALSE)
# Column names carry the Entrez gene ID after a space; keep only the text
# before the first space
colnames(dep) <- sub(" .*", "", colnames(dep))
The latest CRISPR CERES score data (18Q4, November 2018) were pulled from the DepMap Data Portal (Broad Institute Cancer Dependency Map 2018, Meyers et al. 2017).
# Gene effect (CERES score) matrix (comma-separated, gzip-compressed)
crispr <- read.csv("./data_munging/18Q4/gene_effect_18Q4_20181115.csv.gz", check.names = FALSE)
# Column names carry the Entrez gene ID after a space; keep only the text
# before the first space
colnames(crispr) <- sub(" .*", "", colnames(crispr))
Merge annotation data:
# Melt the CERES score matrix: one row per DepMap_ID x gene column
crispr_melt <- melt(crispr, id.vars = "DepMap_ID", measure.vars = colnames(crispr)[2:ncol(crispr)], variable.name = "Hugo_Symbol", value.name = "Score")
# Melt the dependency probability matrix the same way
dep_melt <- melt(dep, id.vars = "DepMap_ID", measure.vars = colnames(dep)[2:ncol(dep)], variable.name = "Hugo_Symbol", value.name = "Dep_Prob")
# Attach dependency probabilities, then both cell line metadata tables
crispr_melt <- crispr_melt %>%
  merge(dep_melt, by = c("DepMap_ID", "Hugo_Symbol"), all.x = TRUE) %>%
  merge(ccl_info, by = "DepMap_ID", all.x = TRUE) %>%
  merge(crispr_meta, by = c("CCLE_Name", "DepMap_ID"), all.x = TRUE)
# Attach mutation annotations for the screened genes; gene / cell line pairs
# without a MAF row are treated as wildtype
maf_filt_crispr <- maf_df %>%
  filter(Hugo_Symbol %in% as.character(unique(crispr_melt$Hugo_Symbol)))
crispr_muts <- crispr_melt %>%
  merge(maf_filt_crispr, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE) %>%
  mutate(
    Hugo_Symbol = factor(Hugo_Symbol),
    Mutation_Status_Nonsilent = if_else(is.na(Mutation_Status_Nonsilent), "Wildtype", Mutation_Status_Nonsilent)
  )
# Per-gene counts of wildtype and mutant rows
crispr_muts_summ <- crispr_muts %>%
  group_by(Hugo_Symbol) %>%
  summarize(
    N_Nonsilent_Wildtype = sum(Mutation_Status_Nonsilent == "Wildtype"),
    N_Nonsilent_Mutant = sum(Mutation_Status_Nonsilent == "Mutant")
  )
# Join the per-gene counts back onto the row-level table
crispr_data <- merge(crispr_muts_summ, crispr_muts, by = "Hugo_Symbol")
# Plot colour per mutation status, stored as a factor
crispr_data$Color_Nonsilent <- factor(
  ifelse(crispr_data$Mutation_Status_Nonsilent == "Wildtype", "thistle", "darkorchid")
)
# Cell line lineages
crispr_data <- merge(crispr_data, ccl_converter, by = c("CCLE_Name", "DepMap_ID"), all.x = TRUE)
# Put the lineage levels in reverse alphabetical ORDER without changing any
# row's value. Assigning to levels() renames the existing levels
# position-by-position, which relabels every row with the wrong lineage;
# rebuilding the factor with an explicit `levels =` only changes the ordering.
# A non-factor column is left untouched.
if (is.factor(crispr_data$lineage_name)) {
  crispr_data$lineage_name <- factor(
    crispr_data$lineage_name,
    levels = sort(levels(crispr_data$lineage_name), decreasing = TRUE)
  )
}
# Copy number
crispr_data <- crispr_data %>%
  merge(cn_melt, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE)
# Gene expression (RPKM), restricted to the screened genes before merging
ge_crispr_18Q3 <- ge_melt_18Q3 %>%
  filter(Hugo_Symbol %in% as.character(unique(crispr_melt$Hugo_Symbol)))
# ge_crispr_18Q4 <- filter(ge_melt_18Q4, Hugo_Symbol %in% as.character(unique(crispr_melt$Hugo_Symbol)))
crispr_data <- crispr_data %>%
  merge(ge_crispr_18Q3, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)
# crispr_data <- merge(crispr_data, ge_crispr_18Q4, by = c("Hugo_Symbol", "DepMap_ID"), all.x = TRUE)
# Cache the fully annotated table
saveRDS(crispr_data, "./../htscreens_giant_files/crispr_data_18Q4.rds", compress = "xz")
# Takes a really long time to run (4+ hours)
# crispr_data_ptmuts <- filter(crispr_data, Reference_Allele == 1 | Reference_Allele == 0 | is.na(Reference_Allele))
# crispr_data_ptmuts$Keep_PtMut_Tests <- ifelse(crispr_data_ptmuts$Reference_Allele == 1 | is.na(crispr_data_ptmuts$Reference_Allele), TRUE, ifelse(crispr_data_ptmuts$Tumor_Seq_Allele1 == 1, TRUE, FALSE))
# crispr_data_ptmuts <- filter(crispr_data_ptmuts, Keep_PtMut_Tests == TRUE)
# crispr_data_ptmuts <- subset(crispr_data_ptmuts, select = -c(Variant_Classification, Reference_Allele, Tumor_Seq_Allele1, Genome_Change, Keep_PtMut_Tests))
# crispr_data_ptmuts <- unique(crispr_data_ptmuts)
# saveRDS(crispr_data_ptmuts, "./data_munging/rds/crispr_data_18Q4_ptmuts.rds", compress = "xz")
crispr_data <- readRDS("./../htscreens_giant_files/crispr_data_18Q4.rds")
# Run next two lines only if the file was made (see end of the last chunk)
# crispr_data_ptmuts <- readRDS("./../htscreens_giant_files/crispr_data_18Q4_ptmuts.rds")
# crispr_data_g2p <- filter(crispr_data_ptmuts, Hugo_Symbol %in% g2p_genes)
# Restrict the CRISPR data to the G2P genes
crispr_data_ptmuts <- filter(crispr_data, Hugo_Symbol %in% g2p_genes)
# Filter G2P CRISPR for point mutations: keep rows with no MAF entry
# (Reference_Allele is NA), rows with Reference_Allele == 1, and rows with
# Reference_Allele == 0 whose Tumor_Seq_Allele1 == 1.
# NOTE(review): the alleles are compared against 0/1, so these columns are
# presumably allele lengths/counts rather than bases - confirm.
crispr_data_ptmuts <- filter(crispr_data_ptmuts, Reference_Allele == 1 | Reference_Allele == 0 | is.na(Reference_Allele))
crispr_data_ptmuts$Keep_PtMut_Tests <- ifelse(crispr_data_ptmuts$Reference_Allele == 1 | is.na(crispr_data_ptmuts$Reference_Allele), TRUE, ifelse(crispr_data_ptmuts$Tumor_Seq_Allele1 == 1, TRUE, FALSE))
crispr_data_ptmuts <- filter(crispr_data_ptmuts, Keep_PtMut_Tests == TRUE)
# Remove columns that lead to duplicate mutation entries
crispr_data_ptmuts <- subset(crispr_data_ptmuts, select = -c(Variant_Classification, Reference_Allele, Tumor_Seq_Allele1, Genome_Change, Keep_PtMut_Tests))
# Deduplicate crispr_data_ptmuts itself, not only crispr_data_g2p, as is done
# for the CCLE/CTRP/GDSC *_ptmuts tables. crispr_data_ptmuts is reused later
# for plots and counts; without this step a cell line with several mutations
# in one gene would appear several times.
crispr_data_ptmuts <- unique(crispr_data_ptmuts)
crispr_data_g2p <- crispr_data_ptmuts
# Gene levels follow the g2p_genes order; "Mutant" is the first status level
crispr_data_g2p$Hugo_Symbol <- factor(crispr_data_g2p$Hugo_Symbol, levels = g2p_genes)
crispr_data_g2p$Mutation_Status_Nonsilent <- factor(crispr_data_g2p$Mutation_Status_Nonsilent, levels = c("Mutant", "Wildtype"))
Wilcoxon tests for assessing effect of mutation status on gene essentiality scores (CRISPR) and AUC (CCLE, CTRP, and GDSC).
Test G2P genes in lineage-agnostic context:
# Per-gene Wilcoxon test of CERES score by mutation status (BH-adjusted),
# annotated by adj_signif() at alpha = 0.05
crispr_signif_g2p_gene <- adj_signif(
  compare_means(
    Score ~ Mutation_Status_Nonsilent,
    group.by = "Hugo_Symbol",
    data = crispr_data_g2p,
    method = "wilcox.test",
    p.adjust.method = "BH"
  ),
  alpha = 0.05
)
# Sort by increasing raw p-value and cache
crispr_signif_g2p_gene <- crispr_signif_g2p_gene[order(crispr_signif_g2p_gene[["p"]]), ]
saveRDS(crispr_signif_g2p_gene, "./data_munging/rds/crispr_signif_g2p_gene.rds")
crispr_signif_g2p_gene <- readRDS("./data_munging/rds/crispr_signif_g2p_gene.rds")
# Append an all-NA placeholder row for G6PD (presumably because the test
# produced no result for it - confirm) so the gene still appears in the table
# and plot. bind_rows() fills every other column with a correctly typed NA,
# so the numeric p-value columns stay numeric and no column count is
# hard-coded. The row is added only when G6PD is absent, which avoids a
# duplicated gene.
if (!"G6PD" %in% as.character(crispr_signif_g2p_gene$Hugo_Symbol)) {
  crispr_signif_g2p_gene <- dplyr::bind_rows(
    crispr_signif_g2p_gene,
    data.frame(Hugo_Symbol = "G6PD", stringsAsFactors = FALSE)
  )
}
# Scrollable HTML table of the gene-level test results
crispr_signif_g2p_gene_kable <- knitr::kable(crispr_signif_g2p_gene[, c("Hugo_Symbol", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "k", "FDR_cutoff")], caption = "Wilcoxon test results for Level A point mutations (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Plot results of crispr_signif_g2p_gene tests:
# Row-level CRISPR data for every gene that has a row in the gene-level
# test table
crispr_data_signif_g2p_gene_filt <- filter(crispr_data_ptmuts, Hugo_Symbol %in% as.character(crispr_signif_g2p_gene$Hugo_Symbol))
# Attach the test results to each row
crispr_data_signif_g2p_gene <- merge(crispr_data_signif_g2p_gene_filt, crispr_signif_g2p_gene, by = "Hugo_Symbol")
# Facet order follows the row order of the test table
crispr_data_signif_g2p_gene$Hugo_Symbol <- ordered(crispr_data_signif_g2p_gene$Hugo_Symbol, levels = as.character(crispr_signif_g2p_gene$Hugo_Symbol))
# Text_y: per-gene maximum Score, used as the y position of the
# significance label
crispr_data_signif_g2p_gene <- merge(crispr_data_signif_g2p_gene, crispr_data_signif_g2p_gene %>% group_by(Hugo_Symbol) %>% summarize(Text_y = max(Score)), by = "Hugo_Symbol")
# One label row per distinct gene / significance code / label height
crispr_data_signif_g2p_gene_text <- unique(data.frame(p.signif = crispr_data_signif_g2p_gene$p.signif, p.signif.adj = crispr_data_signif_g2p_gene$p.signif.adj, Hugo_Symbol = crispr_data_signif_g2p_gene$Hugo_Symbol, Text_y = crispr_data_signif_g2p_gene$Text_y))
# Jittered points with a boxplot overlay, one facet per gene (empty facets
# are kept via drop = FALSE), a dashed reference line at 0, and the
# unadjusted significance code centred between the two groups
crispr_data_signif_g2p_gene_plot <- ggplot(data = crispr_data_signif_g2p_gene, aes(x = Mutation_Status_Nonsilent, y = Score)) +
facet_wrap(~ Hugo_Symbol, drop = FALSE, nrow = 2, scales = "free_y") +
geom_jitter(aes(color = Mutation_Status_Nonsilent), alpha = 0.5, width = 0.3) +
geom_boxplot(color = "black", fill = NA, outlier.shape = NA) +
geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
scale_color_manual(values = c("Mutant" = "darkorchid", "Wildtype" = "thistle")) +
geom_text(data = crispr_data_signif_g2p_gene_text, mapping = aes(x = 1.5, y = Text_y, label = p.signif), nudge_y = 0.2) +
theme(legend.position = "top", axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) +
labs(y = "CERES Score",
color = "Mutation Status",
title = "CERES scores for select Level A G2P genes",
subtitle = "Genes are sorted by increasing p-value.")
crispr_data_signif_g2p_gene_plot
# ggsave("./plots/manuscript/crispr_data_signif_g2p_gene_plot.png", crispr_data_signif_g2p_gene_plot, width = 12, height = 7, units = "in")
Test G2P genes in lineage-grouped context:
# Wilcoxon test of CERES score by mutation status within each gene x lineage
# group (BH-adjusted), annotated by adj_signif() at alpha = 0.05
crispr_signif_g2p_lineage <- adj_signif(
  compare_means(
    Score ~ Mutation_Status_Nonsilent,
    group.by = c("Hugo_Symbol", "group_general_lineage_name"),
    data = crispr_data_g2p,
    method = "wilcox.test",
    p.adjust.method = "BH"
  ),
  alpha = 0.05
)
# Sort by increasing raw p-value and cache
crispr_signif_g2p_lineage <- crispr_signif_g2p_lineage[order(crispr_signif_g2p_lineage[["p"]]), ]
saveRDS(crispr_signif_g2p_lineage, "./data_munging/rds/crispr_signif_g2p_lineage.rds")
# write.table(crispr_signif_g2p_lineage, file = "~/Desktop/crispr_signif_g2p_lineage.csv", quote = FALSE, sep = ",", row.names = FALSE)
crispr_signif_g2p_lineage <- readRDS("./data_munging/rds/crispr_signif_g2p_lineage.rds")
# Scrollable HTML table of the lineage-level results with raw p < 0.1
crispr_signif_g2p_lineage_kable <- filter(crispr_signif_g2p_lineage, p < 0.1)[, c("Hugo_Symbol", "group_general_lineage_name", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "Wilcoxon test results comparing non-silent mutant vs wildtype cell lines by lineage, p < 0.1 (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Plot results of crispr_signif_g2p_lineage tests:
# Attach the lineage-level test results to the row-level CRISPR data; this
# is an inner merge, so gene/lineage pairs without a test row are dropped
crispr_data_signif_g2p_lineage <- merge(crispr_data_signif_g2p_gene_filt, crispr_signif_g2p_lineage, by = c("Hugo_Symbol", "group_general_lineage_name"))
# Facet order: first appearance of each gene in the lineage-level results.
# NOTE(review): the plot subtitle below says genes are sorted by the
# lineage-agnostic p-value, but the levels here come from the lineage-level
# table - confirm which ordering is intended.
crispr_data_signif_g2p_lineage$Hugo_Symbol <- ordered(crispr_data_signif_g2p_lineage$Hugo_Symbol, levels = as.character(unique((crispr_signif_g2p_lineage$Hugo_Symbol))))
# Text_y: per-gene maximum Score (not per lineage), used as the y position
# of the significance labels
crispr_data_signif_g2p_lineage <- merge(crispr_data_signif_g2p_lineage, crispr_data_signif_g2p_lineage %>% group_by(Hugo_Symbol) %>% summarize(Text_y = max(Score)), by = "Hugo_Symbol")
# One label row per distinct gene / lineage / significance code
crispr_data_signif_g2p_lineage_text <- unique(data.frame(p.signif = crispr_data_signif_g2p_lineage$p.signif, p.signif.adj = crispr_data_signif_g2p_lineage$p.signif.adj, group_general_lineage_name = crispr_data_signif_g2p_lineage$group_general_lineage_name, Hugo_Symbol = crispr_data_signif_g2p_lineage$Hugo_Symbol, Text_y = crispr_data_signif_g2p_lineage$Text_y))
# Points per lineage, one facet per gene, a dashed reference line at 0, and
# the unadjusted significance code drawn vertically above each lineage
crispr_data_signif_g2p_lineage_plot <- ggplot(data = crispr_data_signif_g2p_lineage, aes(x = group_general_lineage_name, y = Score)) +
facet_wrap(~ Hugo_Symbol, drop = FALSE, nrow = 6, scales = "free_y") +
geom_point(aes(color = Mutation_Status_Nonsilent), alpha = 0.5) +
geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
scale_color_manual(values = c("Mutant" = "darkorchid", "Wildtype" = "thistle")) +
geom_text(data = crispr_data_signif_g2p_lineage_text, mapping = aes(x = group_general_lineage_name, y = Text_y, label = p.signif), angle = 90, vjust = 0.8, nudge_y = 0.1) +
theme(legend.position = "top", axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 10)) +
labs(y = "CERES Score",
x = "Lineage",
color = "Mutation Status",
title = "CERES score by cell line lineage for significant Level A G2P genes",
subtitle = "* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001\nGenes are sorted by increasing lineage-agnostic p-value.")
crispr_data_signif_g2p_lineage_plot
# write.table(crispr_data_signif_g2p_lineage, file = "~/Desktop/crispr_data_signif_g2p_lineage_20181213.tsv", quote = FALSE, sep = "\t")
# ggsave("./plots/manuscript/crispr_data_signif_g2p_lineage_plot.png", crispr_data_signif_g2p_lineage_plot, width = 15, height = 18, units = "in")
Count mutant/wildtype calls in each gene-lineage group:
# Row counts per gene, lineage and mutation status
crispr_data_signif_g2p_lineage_summ <- crispr_data_signif_g2p_lineage %>%
  group_by(Hugo_Symbol, group_general_lineage_name, Mutation_Status_Nonsilent) %>%
  tally()
# write.table(crispr_data_signif_g2p_lineage_summ, file = "~/Desktop/crispr_data_signif_g2p_lineage_summ_20181005.tsv", quote = FALSE, sep = "\t")
# Scrollable HTML table of those counts
crispr_data_signif_g2p_lineage_summ_kable <- crispr_data_signif_g2p_lineage_summ %>%
  knitr::kable(caption = "Mutant/Wildtype counts for gene-lineage combinations") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Drug metadata:
# Headerless two-column table (identifier, drug name); drug names are
# lower-cased
drug_meta <- setNames(
  read.delim("./data_munging/drug_id_drug_name_table_10_12_2018.tsv", sep = "\t", header = FALSE),
  c("CID", "Drug")
)
drug_meta[["Drug"]] <- tolower(drug_meta[["Drug"]])
Test all CRISPR genes in lineage-agnostic context:
# ccle <- read.delim("./data_munging/data_drug_auc_ccle_10_12_2018.csv", sep = "\t", header = TRUE, row.names = 1, check.names = FALSE)
# ccle <- merge(drug_meta, ccle, by = "CID")
# ccle_melt <- melt(ccle, id.vars = c("CID", "Drug"), measure.vars = colnames(ccle)[3:ncol(ccle)], variable.name = "accession_id", value.name = "AUC")
# ccle_grid <- expand.grid("accession_id" = unique(ccle_melt$accession_id), "Hugo_Symbol" = as.character(unique(crispr_data$Hugo_Symbol)))
# system.time({ ccle_data_allgenes <- WilcoxonByDrugAllGenes(ccle_melt, ccle_grid, "CCLE") })
# saveRDS(ccle_data_allgenes, "./data_munging/rds/ccle_data_18Q3_allgenes.rds", compress = "xz")
# user system elapsed
# 1483.399 262.649 1901.698
# Load the cached all-genes results and sort by increasing raw p-value
ccle_data_allgenes <- readRDS("./data_munging/rds/ccle_data_18Q3_allgenes.rds")
ccle_data_allgenes <- ccle_data_allgenes[order(ccle_data_allgenes$p),]
# Show at most the 1000 top-ranked results. head() is used instead of
# [1:1000, ] so that a table with fewer than 1000 rows is not padded with
# all-NA rows.
ccle_data_allgenes_kable <- knitr::kable(head(ccle_data_allgenes[, c("Hugo_Symbol", "Drug", "p", "p.adj", "p.format", "p.signif", "p.signif.adj")], 1000), caption = "CCLE: by-drug Wilcoxon test results for point mutations in genes targeted in CRISPR screen (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test G2P genes in lineage-agnostic context:
# G2P drug-gene associations only: Wilcoxon test of AUC z-score by mutation
# status per Drug_Gene (BH-adjusted), annotated at alpha = 0.05
ccle_signif_g2p <- adj_signif(
  compare_means(
    AUC_zscore ~ Mutation_Status_Nonsilent,
    group.by = "Drug_Gene",
    data = ccle_data_g2p,
    method = "wilcox.test",
    p.adjust.method = "BH"
  ),
  alpha = 0.05
)
ccle_signif_g2p <- ccle_signif_g2p[order(ccle_signif_g2p[["p"]]), ]
saveRDS(ccle_signif_g2p, "./data_munging/rds/ccle_signif_g2p_gene.rds")
# Grid comparisons of G2P drugs and genes: the same test on the grid subset
ccle_signif_g2p_grid <- adj_signif(
  compare_means(
    AUC_zscore ~ Mutation_Status_Nonsilent,
    group.by = "Drug_Gene",
    data = ccle_data_g2p_grid,
    method = "wilcox.test",
    p.adjust.method = "BH"
  ),
  alpha = 0.05
)
ccle_signif_g2p_grid <- ccle_signif_g2p_grid[order(ccle_signif_g2p_grid[["p"]]), ]
saveRDS(ccle_signif_g2p_grid, "./data_munging/rds/ccle_signif_g2p_grid.rds")
ccle_signif_g2p <- readRDS("./data_munging/rds/ccle_signif_g2p_gene.rds")
ccle_signif_g2p_kable <- knitr::kable(ccle_signif_g2p[, c("Drug_Gene", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")], caption = "CCLE: Wilcoxon test results for level A G2P drug-gene associations (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width = "900px", height = "450px")
ccle_signif_g2p_grid <- readRDS("./data_munging/rds/ccle_signif_g2p_grid.rds")
ccle_signif_g2p_grid_kable <- knitr::kable(ccle_signif_g2p_grid[, c("Drug_Gene", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")], caption = "CCLE: Wilcoxon test results for point mutations in level A G2P genes and drugs (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width = "900px", height = "450px")
Test G2P genes in lineage-grouped context:
# CCLE, lineage-grouped G2P tests.
# Same Wilcoxon / BH / adj_signif() pipeline as the lineage-agnostic tests,
# but grouped by Drug_Gene_Lin. Results are sorted by raw p-value and cached.

# Grid of G2P drugs and genes, per lineage
ccle_signif_grid_lineage <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ccle_data_g2p_grid,
  group.by = "Drug_Gene_Lin",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
ccle_signif_grid_lineage <- ccle_signif_grid_lineage[order(ccle_signif_grid_lineage$p), ]
saveRDS(ccle_signif_grid_lineage, "./data_munging/rds/ccle_signif_grid_lineage.rds")

# G2P drug-gene-lineage combinations only
ccle_signif_g2p_lineage <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ccle_data_g2p_dgl,
  group.by = "Drug_Gene_Lin",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
ccle_signif_g2p_lineage <- ccle_signif_g2p_lineage[order(ccle_signif_g2p_lineage$p), ]
saveRDS(ccle_signif_g2p_lineage, "./data_munging/rds/ccle_signif_g2p_lineage.rds")

# Reload the cached results and render them as scrollable HTML tables.
ccle_signif_g2p_lineage <- readRDS("./data_munging/rds/ccle_signif_g2p_lineage.rds")
ccle_signif_grid_lineage <- readRDS("./data_munging/rds/ccle_signif_grid_lineage.rds")
ccle_signif_g2p_lineage_kable <- ccle_signif_g2p_lineage[, c("Drug_Gene_Lin", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "CCLE: Wilcoxon test results for level A G2P drug-gene associations across lineages (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
# Display at most the first 500 rows. head() stops at the last available row,
# whereas indexing with 1:500 asks for rows that may not exist.
# NOTE(review): this caption is identical to the one on
# ccle_signif_g2p_lineage_kable although the table is built from the grid
# results -- confirm whether it should be worded differently.
ccle_signif_grid_lineage_kable <- head(ccle_signif_grid_lineage, 500)[, c("Drug_Gene_Lin", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "CCLE: Wilcoxon test results for level A G2P drug-gene associations across lineages (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test all CRISPR genes in lineage-agnostic context:
# CTRP, lineage-agnostic, all genes targeted in the CRISPR screen.
# The commented-out lines are the pipeline that wrote the cached RDS loaded
# below.
# ctrp <- read.delim("./data_munging/data_drug_auc_ctrp_10_12_2018.csv", sep = "\t", header = TRUE, row.names = 1, check.names = FALSE)
# ctrp <- merge(drug_meta, ctrp, by = "CID")
# ctrp_melt <- melt(ctrp, id.vars = c("CID", "Drug"), measure.vars = colnames(ctrp)[3:ncol(ctrp)], variable.name = "accession_id", value.name = "AUC")
# ctrp_grid <- expand.grid("accession_id" = unique(ctrp_melt$accession_id), "Hugo_Symbol" = as.character(unique(crispr_data$Hugo_Symbol)))
# ctrp_data_allgenes <- WilcoxonByDrugAllGenes(ctrp_melt, ctrp_grid, "CTRP")
# saveRDS(ctrp_data_allgenes, "./data_munging/rds/ctrp_data_18Q3_allgenes.rds", compress = "xz")
ctrp_data_allgenes <- readRDS("./data_munging/rds/ctrp_data_18Q3_allgenes.rds")
# Sort by raw p-value so the table below shows the strongest results first.
ctrp_data_allgenes <- ctrp_data_allgenes[order(ctrp_data_allgenes$p), ]
# Display at most the first 1000 rows. head() stops at the last available row,
# whereas indexing with 1:1000 asks for rows that may not exist.
ctrp_data_allgenes_kable <- head(ctrp_data_allgenes, 1000)[, c("Hugo_Symbol", "Drug", "p", "p.adj", "p.format", "p.signif", "p.signif.adj")] %>%
  knitr::kable(caption = "CTRP: by-drug Wilcoxon test results for point mutations in genes targeted in CRISPR screen (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test G2P genes in lineage-agnostic context:
# CTRP, lineage-agnostic G2P tests.
# Each result set is a per-Drug_Gene Wilcoxon test of AUC_zscore against
# Mutation_Status_Nonsilent with BH adjustment, passed through adj_signif()
# at alpha = 0.05, sorted by raw p-value, and cached as RDS.

# G2P drug-gene associations only
ctrp_signif_g2p <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ctrp_data_g2p,
  group.by = "Drug_Gene",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
ctrp_signif_g2p <- ctrp_signif_g2p[order(ctrp_signif_g2p[["p"]]), ]
saveRDS(ctrp_signif_g2p, "./data_munging/rds/ctrp_signif_g2p_gene.rds")

# Grid comparisons of G2P drugs and genes
ctrp_signif_g2p_grid <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ctrp_data_g2p_grid,
  group.by = "Drug_Gene",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
ctrp_signif_g2p_grid <- ctrp_signif_g2p_grid[order(ctrp_signif_g2p_grid[["p"]]), ]
saveRDS(ctrp_signif_g2p_grid, "./data_munging/rds/ctrp_signif_g2p_grid.rds")

# Reload the cached results and render them as scrollable HTML tables.
ctrp_signif_g2p <- readRDS("./data_munging/rds/ctrp_signif_g2p_gene.rds")
ctrp_signif_g2p_kable <- ctrp_signif_g2p[, c("Drug_Gene", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "CTRP: Wilcoxon test results for level A G2P drug-gene associations (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
ctrp_signif_g2p_grid <- readRDS("./data_munging/rds/ctrp_signif_g2p_grid.rds")
ctrp_signif_g2p_grid_kable <- ctrp_signif_g2p_grid[, c("Drug_Gene", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "CTRP: Wilcoxon test results for point mutations in level A G2P genes and drugs (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test G2P genes in lineage-grouped context:
# CTRP, lineage-grouped G2P tests.
# Same Wilcoxon / BH / adj_signif() pipeline as the lineage-agnostic tests,
# but grouped by Drug_Gene_Lin. Results are sorted by raw p-value and cached.

# Grid of G2P drugs and genes, per lineage
ctrp_signif_grid_lineage <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ctrp_data_g2p_grid,
  group.by = "Drug_Gene_Lin",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
ctrp_signif_grid_lineage <- ctrp_signif_grid_lineage[order(ctrp_signif_grid_lineage$p), ]
saveRDS(ctrp_signif_grid_lineage, "./data_munging/rds/ctrp_signif_grid_lineage.rds")

# G2P drug-gene-lineage combinations only
ctrp_signif_g2p_lineage <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ctrp_data_g2p_dgl,
  group.by = "Drug_Gene_Lin",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
ctrp_signif_g2p_lineage <- ctrp_signif_g2p_lineage[order(ctrp_signif_g2p_lineage$p), ]
saveRDS(ctrp_signif_g2p_lineage, "./data_munging/rds/ctrp_signif_g2p_lineage.rds")

# Reload the cached results and render them as scrollable HTML tables.
ctrp_signif_g2p_lineage <- readRDS("./data_munging/rds/ctrp_signif_g2p_lineage.rds")
ctrp_signif_grid_lineage <- readRDS("./data_munging/rds/ctrp_signif_grid_lineage.rds")
ctrp_signif_g2p_lineage_kable <- ctrp_signif_g2p_lineage[, c("Drug_Gene_Lin", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "CTRP: Wilcoxon test results for level A G2P drug-gene associations across lineages (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
# Display at most the first 500 rows. head() stops at the last available row,
# whereas indexing with 1:500 asks for rows that may not exist.
# NOTE(review): this caption is identical to the one on
# ctrp_signif_g2p_lineage_kable although the table is built from the grid
# results -- confirm whether it should be worded differently.
ctrp_signif_grid_lineage_kable <- head(ctrp_signif_grid_lineage, 500)[, c("Drug_Gene_Lin", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "CTRP: Wilcoxon test results for level A G2P drug-gene associations across lineages (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test all CRISPR genes in lineage-agnostic context:
# GDSC, lineage-agnostic, all genes targeted in the CRISPR screen.
# The commented-out lines are the pipeline that wrote the cached RDS loaded
# below.
# gdsc <- read.delim("./data_munging/data_drug_auc_gdsc_10_12_2018.csv", sep = "\t", header = TRUE, row.names = 1, check.names = FALSE)
# gdsc <- merge(drug_meta, gdsc, by = "CID")
# gdsc_melt <- melt(gdsc, id.vars = c("CID", "Drug"), measure.vars = colnames(gdsc)[3:ncol(gdsc)], variable.name = "accession_id", value.name = "AUC")
# gdsc_grid <- expand.grid("accession_id" = unique(gdsc_melt$accession_id), "Hugo_Symbol" = as.character(unique(crispr_data$Hugo_Symbol)))
# system.time({ gdsc_data_allgenes <- WilcoxonByDrugAllGenes(gdsc_melt, gdsc_grid, "GDSC") })
# saveRDS(gdsc_data_allgenes, "./data_munging/rds/gdsc_data_18Q3_allgenes.rds", compress = "xz")
gdsc_data_allgenes <- readRDS("./data_munging/rds/gdsc_data_18Q3_allgenes.rds")
# Sort by raw p-value so the table below shows the strongest results first.
gdsc_data_allgenes <- gdsc_data_allgenes[order(gdsc_data_allgenes$p), ]
# Display at most the first 1000 rows. head() stops at the last available row,
# whereas indexing with 1:1000 asks for rows that may not exist.
gdsc_data_allgenes_kable <- head(gdsc_data_allgenes, 1000)[, c("Hugo_Symbol", "Drug", "p", "p.adj", "p.format", "p.signif", "p.signif.adj")] %>%
  knitr::kable(caption = "GDSC: by-drug Wilcoxon test results for point mutations in genes targeted in CRISPR screen (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test G2P genes in lineage-agnostic context:
# GDSC, lineage-agnostic G2P tests.
# Each result set is a per-Drug_Gene Wilcoxon test of AUC_zscore against
# Mutation_Status_Nonsilent with BH adjustment, passed through adj_signif()
# at alpha = 0.05, sorted by raw p-value, and cached as RDS.

# G2P drug-gene associations only
gdsc_signif_g2p <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = gdsc_data_g2p,
  group.by = "Drug_Gene",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
gdsc_signif_g2p <- gdsc_signif_g2p[order(gdsc_signif_g2p$p), ]
saveRDS(gdsc_signif_g2p, "./data_munging/rds/gdsc_signif_g2p_gene.rds")

# Grid comparisons of G2P drugs and genes
gdsc_signif_g2p_grid <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = gdsc_data_g2p_grid,
  group.by = "Drug_Gene",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
gdsc_signif_g2p_grid <- gdsc_signif_g2p_grid[order(gdsc_signif_g2p_grid$p), ]
saveRDS(gdsc_signif_g2p_grid, "./data_munging/rds/gdsc_signif_g2p_grid.rds")

# Reload the cached results and render them as scrollable HTML tables.
gdsc_signif_g2p <- readRDS("./data_munging/rds/gdsc_signif_g2p_gene.rds")
# FDR_cutoff is included so this table shows the same columns as the GDSC grid
# table below and the corresponding CCLE and CTRP tables.
gdsc_signif_g2p_kable <- gdsc_signif_g2p[, c("Drug_Gene", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "GDSC: Wilcoxon test results for level A G2P drug-gene associations (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
gdsc_signif_g2p_grid <- readRDS("./data_munging/rds/gdsc_signif_g2p_grid.rds")
gdsc_signif_g2p_grid_kable <- gdsc_signif_g2p_grid[, c("Drug_Gene", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "GDSC: Wilcoxon test results for point mutations in level A G2P genes and drugs (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Test G2P genes in lineage-grouped context:
# GDSC, lineage-grouped G2P tests.
# Same Wilcoxon / BH / adj_signif() pipeline as the lineage-agnostic tests,
# but grouped by Drug_Gene_Lin. Results are sorted by raw p-value and cached.

# Grid of G2P drugs and genes, per lineage
gdsc_signif_grid_lineage <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = gdsc_data_g2p_grid,
  group.by = "Drug_Gene_Lin",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
gdsc_signif_grid_lineage <- gdsc_signif_grid_lineage[order(gdsc_signif_grid_lineage$p), ]
saveRDS(gdsc_signif_grid_lineage, "./data_munging/rds/gdsc_signif_grid_lineage.rds")

# G2P drug-gene-lineage combinations only
gdsc_signif_g2p_lineage <- compare_means(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = gdsc_data_g2p_dgl,
  group.by = "Drug_Gene_Lin",
  method = "wilcox.test",
  p.adjust.method = "BH"
) %>%
  adj_signif(alpha = 0.05)
gdsc_signif_g2p_lineage <- gdsc_signif_g2p_lineage[order(gdsc_signif_g2p_lineage$p), ]
saveRDS(gdsc_signif_g2p_lineage, "./data_munging/rds/gdsc_signif_g2p_lineage.rds")

# Reload the cached results and render them as scrollable HTML tables.
gdsc_signif_g2p_lineage <- readRDS("./data_munging/rds/gdsc_signif_g2p_lineage.rds")
gdsc_signif_grid_lineage <- readRDS("./data_munging/rds/gdsc_signif_grid_lineage.rds")
gdsc_signif_g2p_lineage_kable <- gdsc_signif_g2p_lineage[, c("Drug_Gene_Lin", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "GDSC: Wilcoxon test results for level A G2P drug-gene associations across lineages (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
# Display at most the first 500 rows. head() stops at the last available row,
# whereas indexing with 1:500 asks for rows that may not exist.
# NOTE(review): this caption is identical to the one on
# gdsc_signif_g2p_lineage_kable although the table is built from the grid
# results -- confirm whether it should be worded differently.
gdsc_signif_grid_lineage_kable <- head(gdsc_signif_grid_lineage, 500)[, c("Drug_Gene_Lin", "p", "p.adj", "p.format", "p.signif", "p.signif.adj", "FDR_cutoff")] %>%
  knitr::kable(caption = "GDSC: Wilcoxon test results for level A G2P drug-gene associations across lineages (* p <= 0.05, ** p <= 0.01, *** p <= 0.001, **** p <= 0.0001)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Pearson and Spearman correlations for assessing effect of gene expression and copy number on gene essentiality score and AUC.
Example of how to describe Spearman correlation results: "A Spearman’s correlation was run to determine the relationship between 23 groundwater uranium and TDS values. There was a strong, positive monotonic correlation between uranium and TDS ($r_s$ = .71, n = 23, p < .001)."
# Per-gene correlation between the essentiality Score and one annotation
# column of a CRISPR data frame.
#
# data:         data frame with Hugo_Symbol, Score and the `predictor` column.
# predictor:    name (string) of the column correlated against Score.
# cor_method:   correlation method passed to cor.test(), e.g. "pearson".
# metric_label: value written to the Metric column of the result.
#
# Returns one row per Hugo_Symbol with columns Hugo_Symbol, p, corr, Metric.
#
# cor.test() is run once per gene and both the p-value and the estimate are
# read from that single result. No `use` argument is passed: cor.test() has
# no such parameter and already restricts itself to complete pairs.
crispr_score_correlation <- function(data, predictor, cor_method, metric_label) {
  data %>%
    group_by(Hugo_Symbol) %>%
    summarize(
      cor_result = list(
        cor.test(x = .data[[predictor]], y = Score, method = cor_method)
      )
    ) %>%
    transmute(
      Hugo_Symbol,
      p = vapply(cor_result, function(res) res$p.value, numeric(1)),
      corr = vapply(cor_result, function(res) unname(res$estimate), numeric(1)),
      Metric = metric_label
    )
}

crispr_pearson_res_GE <- crispr_score_correlation(
  crispr_data_g2p, predictor = "RPKM", cor_method = "pearson",
  metric_label = "Gene Expression"
)
crispr_pearson_res_CN <- crispr_score_correlation(
  crispr_data_g2p, predictor = "CN", cor_method = "pearson",
  metric_label = "Copy Number"
)
crispr_spearman_res_GE <- crispr_score_correlation(
  crispr_data_g2p, predictor = "RPKM", cor_method = "spearman",
  metric_label = "Gene Expression"
)
crispr_spearman_res_CN <- crispr_score_correlation(
  crispr_data_g2p, predictor = "CN", cor_method = "spearman",
  metric_label = "Copy Number"
)
# Number of g2p_filt rows per Drug_Gene (rows with missing values dropped).
g2p_druggene_tally <- g2p_filt %>%
  group_by(Drug_Gene) %>%
  tally() %>%
  drop_na()

# Per-Drug_Gene Pearson and Spearman correlations of AUC_zscore with gene
# expression (RPKM) and copy number (CN) for one drug-screen dataset.
#
# data:           data frame with Drug_Gene, AUC_zscore, RPKM and CN columns.
# dataset_name:   label written to the Dataset column.
# druggene_tally: Drug_Gene / n table; a Drug_Gene absent from it gets n = 0.
#
# Returns a data frame with one row per Drug_Gene: n, the eight correlation
# and p-value columns, InG2P ("Yes" when n != 0) and Dataset.
#
# Each of the four cor.test() calls is run once per group and both the
# estimate and the p-value are read from that single result. No `use`
# argument is passed: cor.test() has no such parameter and already restricts
# itself to complete pairs.
summarize_auc_correlations <- function(data, dataset_name, druggene_tally) {
  estimate_of <- function(tests) {
    vapply(tests, function(res) unname(res$estimate), numeric(1))
  }
  p_value_of <- function(tests) {
    vapply(tests, function(res) res$p.value, numeric(1))
  }

  cor_res <- data %>%
    group_by(Drug_Gene) %>%
    summarize(
      pearson_ge = list(cor.test(x = RPKM, y = AUC_zscore, method = "pearson")),
      pearson_cn = list(cor.test(x = CN, y = AUC_zscore, method = "pearson")),
      spearman_ge = list(cor.test(x = RPKM, y = AUC_zscore, method = "spearman")),
      spearman_cn = list(cor.test(x = CN, y = AUC_zscore, method = "spearman"))
    ) %>%
    transmute(
      Drug_Gene,
      Pearson_Corr_GE = estimate_of(pearson_ge),
      Pearson_Pval_GE = p_value_of(pearson_ge),
      Pearson_Corr_CN = estimate_of(pearson_cn),
      Pearson_Pval_CN = p_value_of(pearson_cn),
      Spearman_Corr_GE = estimate_of(spearman_ge),
      Spearman_Pval_GE = p_value_of(spearman_ge),
      Spearman_Corr_CN = estimate_of(spearman_cn),
      Spearman_Pval_CN = p_value_of(spearman_cn)
    )

  # all.y = TRUE keeps every tested Drug_Gene, including those not in the tally.
  cor_res <- merge(druggene_tally, cor_res, by = "Drug_Gene", all.y = TRUE)
  cor_res$n <- ifelse(is.na(cor_res$n), 0, cor_res$n)
  cor_res$InG2P <- ifelse(cor_res$n != 0, "Yes", "No")
  cor_res$Dataset <- dataset_name
  cor_res
}

# Adds <Method>_<Metric>_Abs (absolute correlation) and <Method>_<Metric>_Sign
# ("+", "-" or "0") columns for every <Method>_Corr_<Metric> column.
add_abs_and_sign <- function(cor_df) {
  for (method in c("Pearson", "Spearman")) {
    for (metric in c("GE", "CN")) {
      corr <- cor_df[[paste0(method, "_Corr_", metric)]]
      direction <- sign(corr)
      cor_df[[paste0(method, "_", metric, "_Abs")]] <- abs(corr)
      cor_df[[paste0(method, "_", metric, "_Sign")]] <-
        ifelse(direction == 1, "+", ifelse(direction == -1, "-", "0"))
    }
  }
  cor_df
}

# CCLE
ccle_cor_res <- summarize_auc_correlations(ccle_data_g2p_grid, "CCLE", g2p_druggene_tally)
# CTRP
ctrp_cor_res <- summarize_auc_correlations(ctrp_data_g2p_grid, "CTRP", g2p_druggene_tally)
# GDSC
gdsc_cor_res <- summarize_auc_correlations(gdsc_data_g2p_grid, "GDSC", g2p_druggene_tally)
# All
g2p_cor <- add_abs_and_sign(rbind(ccle_cor_res, ctrp_cor_res, gdsc_cor_res))
Deprecated: T-tests of G2P presence/absence distributions
# ccle_pearson_ttest_GE <- t.test(ccle_cor_res$Pearson_Corr_GE ~ ccle_cor_res$InG2P)
# ccle_pearson_ttest_res_GE <- paste0("t(", round(ccle_pearson_ttest_GE$parameter, 0), ") = ", round(ccle_pearson_ttest_GE$statistic, 2), ", p = ", round(ccle_pearson_ttest_GE$p.value, 2))
# ccle_pearson_ttest_CN <- t.test(ccle_cor_res$Pearson_Corr_CN ~ ccle_cor_res$InG2P)
# ccle_pearson_ttest_res_CN <- paste0("t(", round(ccle_pearson_ttest_CN$parameter, 0), ") = ", round(ccle_pearson_ttest_CN$statistic, 2), ", p = ", round(ccle_pearson_ttest_CN$p.value, 2))
# ccle_spearman_ttest_GE <- t.test(ccle_cor_res$Spearman_Corr_GE ~ ccle_cor_res$InG2P)
# ccle_spearman_ttest_res_GE <- paste0("t(", round(ccle_spearman_ttest_GE$parameter, 0), ") = ", round(ccle_spearman_ttest_GE$statistic, 2), ", p = ", round(ccle_spearman_ttest_GE$p.value, 2))
# ccle_spearman_ttest_CN <- t.test(ccle_cor_res$Spearman_Corr_CN ~ ccle_cor_res$InG2P)
# ccle_spearman_ttest_res_CN <- paste0("t(", round(ccle_spearman_ttest_CN$parameter, 0), ") = ", round(ccle_spearman_ttest_CN$statistic, 2), ", p = ", formatC(ccle_spearman_ttest_CN$p.value, 2, format = "f"))
# ctrp_pearson_ttest_GE <- t.test(ctrp_cor_res$Pearson_Corr_GE ~ ctrp_cor_res$InG2P)
# ctrp_pearson_ttest_res_GE <- paste0("t(", round(ctrp_pearson_ttest_GE$parameter, 0), ") = ", round(ctrp_pearson_ttest_GE$statistic, 2), ", p ", ifelse(round(ctrp_pearson_ttest_GE$p.value, 3) == 0, "< 0.001", paste0("= ", round(ctrp_pearson_ttest_GE$p.value, 3))))
# ctrp_pearson_ttest_CN <- t.test(ctrp_cor_res$Pearson_Corr_CN ~ ctrp_cor_res$InG2P)
# ctrp_pearson_ttest_res_CN <- paste0("t(", round(ctrp_pearson_ttest_CN$parameter, 0), ") = ", round(ctrp_pearson_ttest_CN$statistic, 2), ", p = ", round(ctrp_pearson_ttest_CN$p.value, 3))
# ctrp_spearman_ttest_GE <- t.test(ctrp_cor_res$Spearman_Corr_GE ~ ctrp_cor_res$InG2P)
# ctrp_spearman_ttest_res_GE <- paste0("t(", round(ctrp_spearman_ttest_GE$parameter, 0), ") = ", round(ctrp_spearman_ttest_GE$statistic, 2), ", p = ", round(ctrp_spearman_ttest_GE$p.value, 2))
# ctrp_spearman_ttest_CN <- t.test(ctrp_cor_res$Spearman_Corr_CN ~ ctrp_cor_res$InG2P)
# ctrp_spearman_ttest_res_CN <- paste0("t(", round(ctrp_spearman_ttest_CN$parameter, 0), ") = ", round(ctrp_spearman_ttest_CN$statistic, 2), ", p = ", round(ctrp_spearman_ttest_CN$p.value, 2))
# gdsc_pearson_ttest_GE <- t.test(gdsc_cor_res$Pearson_Corr_GE ~ gdsc_cor_res$InG2P)
# gdsc_pearson_ttest_res_GE <- paste0("t(", round(gdsc_pearson_ttest_GE$parameter, 0), ") = ", round(gdsc_pearson_ttest_GE$statistic, 2), ", p = ", round(gdsc_pearson_ttest_GE$p.value, 3))
# gdsc_pearson_ttest_CN <- t.test(gdsc_cor_res$Pearson_Corr_CN ~ gdsc_cor_res$InG2P)
# gdsc_pearson_ttest_res_CN <- paste0("t(", round(gdsc_pearson_ttest_CN$parameter, 0), ") = ", round(gdsc_pearson_ttest_CN$statistic, 2), ", p = ", round(gdsc_pearson_ttest_CN$p.value, 3))
# gdsc_spearman_ttest_GE <- t.test(gdsc_cor_res$Spearman_Corr_GE ~ gdsc_cor_res$InG2P)
# gdsc_spearman_ttest_res_GE <- paste0("t(", round(gdsc_spearman_ttest_GE$parameter, 0), ") = ", round(gdsc_spearman_ttest_GE$statistic, 2), ", p = ", round(gdsc_spearman_ttest_GE$p.value, 2))
# gdsc_spearman_ttest_CN <- t.test(gdsc_cor_res$Spearman_Corr_CN ~ gdsc_cor_res$InG2P)
# gdsc_spearman_ttest_res_CN <- paste0("t(", round(gdsc_spearman_ttest_CN$parameter, 0), ") = ", round(gdsc_spearman_ttest_CN$statistic, 2), ", p = ", round(gdsc_spearman_ttest_CN$p.value, 2))
# g2p_cor_text <- data.frame(Dataset = c("CCLE", "CTRP", "GDSC"),
# Pearson_Test_Results_GE = c(ccle_pearson_ttest_res_GE, ctrp_pearson_ttest_res_GE, gdsc_pearson_ttest_res_GE),
# Pearson_Test_Results_CN = c(ccle_pearson_ttest_res_CN, ctrp_pearson_ttest_res_CN, gdsc_pearson_ttest_res_CN),
# Spearman_Test_Results_GE = c(ccle_spearman_ttest_res_GE, ctrp_spearman_ttest_res_GE, gdsc_spearman_ttest_res_GE),
# Spearman_Test_Results_CN = c(ccle_spearman_ttest_res_CN, ctrp_spearman_ttest_res_CN, gdsc_spearman_ttest_res_CN))
# Mutation-status summary per gene: distinct (gene, mutant count, wildtype
# count) rows, plus the percentage of mutant lines, sorted from most to least
# frequently mutated and rendered as a scrollable HTML table.
crispr_data_g2p_summ <- unique(
  crispr_data_g2p[, c("Hugo_Symbol", "N_Nonsilent_Mutant", "N_Nonsilent_Wildtype")]
)
crispr_data_g2p_summ$Percent_Mutant <- with(
  crispr_data_g2p_summ,
  N_Nonsilent_Mutant / (N_Nonsilent_Mutant + N_Nonsilent_Wildtype) * 100
)
# Row names are reset before sorting, not after.
rownames(crispr_data_g2p_summ) <- NULL
crispr_data_g2p_summ <- crispr_data_g2p_summ[
  order(crispr_data_g2p_summ[["Percent_Mutant"]], decreasing = TRUE),
]
# write.table(crispr_data_g2p_summ, file = "~/Desktop/crispr_data_g2p_summ_20181003.tsv", quote = FALSE, sep = "\t")
crispr_data_g2p_summ_kable <- crispr_data_g2p_summ %>%
  knitr::kable(caption = "CRISPR: Level A G2P genes mutation status summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
# Descriptive statistics of g2p_filt row counts per drug-gene pair, per drug,
# per gene, and per drug-gene-lineage combination. Lines starting with "##"
# are the console output recorded when the document was rendered.
print("Drug-Gene")
## [1] "Drug-Gene"
g2p_druggene_tally <- g2p_filt %>%
  group_by(Drug_Gene) %>%
  tally() %>%
  drop_na()
nrow(g2p_druggene_tally)
## [1] 88
range(g2p_druggene_tally$n)
## [1] 1 103
mean(g2p_druggene_tally$n)
## [1] 10.13636
sd(g2p_druggene_tally$n)
## [1] 19.802
print("Drug")
## [1] "Drug"
g2p_drug_tally <- g2p_filt %>%
  group_by(Drug) %>%
  tally() %>%
  drop_na()
range(g2p_drug_tally$n)
## [1] 1 176
mean(g2p_drug_tally$n)
## [1] 15.68421
sd(g2p_drug_tally$n)
## [1] 34.62718
print("Gene")
## [1] "Gene"
g2p_gene_tally <- g2p_filt %>%
  group_by(Gene) %>%
  tally() %>%
  drop_na()
range(g2p_gene_tally$n)
## [1] 1 264
mean(g2p_gene_tally$n)
## [1] 26.23529
sd(g2p_gene_tally$n)
## [1] 61.58121
print("Drug-Gene-Lineage")
## [1] "Drug-Gene-Lineage"
g2p_dgl_tally <- g2p_filt %>%
  group_by(Drug, Gene, Drug_Gene, group_general_lineage_name) %>%
  tally() %>%
  drop_na()
nrow(g2p_dgl_tally)
## [1] 102
length(unique(g2p_dgl_tally$Drug))
## [1] 57
length(unique(g2p_dgl_tally$Gene))
## [1] 34
# unique(g2p_filt$Phenotype.Description)
# Distinct cell-line identifier rows present in each dataset.
crispr_ccl <- crispr_data %>%
  select(DepMap_ID, CCLE_Name) %>%
  unique()
ccle_ccl <- ccle_data %>%
  select(DepMap_ID, CCLE_Name, accession_id) %>%
  unique()
ctrp_ccl <- ctrp_data %>%
  select(DepMap_ID, CCLE_Name, accession_id) %>%
  unique()
gdsc_ccl <- gdsc_data %>%
  select(DepMap_ID, CCLE_Name, accession_id) %>%
  unique()
# all_ccl <- merge(ccle_ccl, ctrp_ccl, by = c("DepMap_ID", "CCLE_Name", "accession_id"), all = TRUE)
Summarize the Wilcoxon test p-values of the grid tests by mutation status. Grid tests were all lineage-agnostic G2P drug-gene associations tested across all screened lineages in each dataset:
# Number of g2p rows per Drug_Gene and lineage.
g2p_dgl_mut_tally <- g2p %>%
  group_by(Drug_Gene, group_general_lineage_name) %>%
  tally()

# Annotates one dataset's lineage-grouped grid test results with the G2P tally.
#
# signif_tbl:   test results with Drug_Gene_Lin ("drug_gene_lineage") and p.
# dataset_name: label written to the Dataset column.
# mut_tally:    Drug_Gene / group_general_lineage_name / n table.
#
# Returns a data frame with Drug_Gene, group_general_lineage_name, n (0 when
# the combination is absent from mut_tally), p, Drug_Gene_Lin, InG2P, Dataset.
summarize_grid_lineage <- function(signif_tbl, dataset_name, mut_tally) {
  merge_keys <- c("Drug_Gene", "group_general_lineage_name")

  # Split the combined key on "_" and rebuild the drug-gene part of it.
  pieces <- separate(
    signif_tbl,
    Drug_Gene_Lin,
    c("Drug", "Hugo_Symbol", "group_general_lineage_name"),
    "_"
  )
  pieces$Drug_Gene <- paste0(pieces$Drug, "_", pieces$Hugo_Symbol)

  # all.y = TRUE keeps every tested combination, whether or not it is tallied.
  summ <- merge(mut_tally, pieces[, c(merge_keys, "p")], by = merge_keys, all.y = TRUE)
  summ$Drug_Gene_Lin <- paste0(summ$Drug_Gene, "_", summ$group_general_lineage_name)
  summ$n <- ifelse(is.na(summ$n), 0, summ$n)
  summ$InG2P <- ifelse(summ$n != 0, "Yes", "No")
  summ$Dataset <- dataset_name
  summ
}

ccle_dgl_summ <- summarize_grid_lineage(ccle_signif_grid_lineage, "CCLE", g2p_dgl_mut_tally)
ctrp_dgl_summ <- summarize_grid_lineage(ctrp_signif_grid_lineage, "CTRP", g2p_dgl_mut_tally)
gdsc_dgl_summ <- summarize_grid_lineage(gdsc_signif_grid_lineage, "GDSC", g2p_dgl_mut_tally)
dgl_mut_grid_summ <- rbind(ccle_dgl_summ, ctrp_dgl_summ, gdsc_dgl_summ)
Drug-gene-lineage (DGL) overlaps between CCLE, CTRP, and GDSC:
# Distinct Drug_Gene_Lin keys per dataset, compared in a three-way Venn
# diagram; the returned object is then printed.
dgl_overlap_test_list <- lapply(
  list(
    CCLE = ccle_data_g2p_dgl,
    CTRP = ctrp_data_g2p_dgl,
    GDSC = gdsc_data_g2p_dgl
  ),
  function(dgl_data) unique(dgl_data$Drug_Gene_Lin)
)
ItemsList <- venn(dgl_overlap_test_list, show.plot = TRUE)
ItemsList
## num CCLE CTRP GDSC
## 000 0 0 0 0
## 001 11 0 0 1
## 010 18 0 1 0
## 011 35 0 1 1
## 100 1 1 0 0
## 101 0 1 0 1
## 110 1 1 1 0
## 111 7 1 1 1
## attr(,"intersections")
## attr(,"intersections")$`CCLE:CTRP:GDSC`
## [1] "erlotinib_EGFR_lung cancer" "erlotinib_KRAS_lung cancer"
## [3] "lapatinib_KRAS_breast cancer" "lapatinib_ERBB2_breast cancer"
## [5] "erlotinib_MET_lung cancer" "nilotinib_ABL1_leukemia"
## [7] "nilotinib_UGT1A1_cancer"
##
## attr(,"intersections")$CCLE
## [1] "irinotecan_UGT1A1_cancer"
##
## attr(,"intersections")$CTRP
## [1] "azd_EGFR_lung cancer"
## [2] "azd_KRAS_lung cancer"
## [3] "neratinib_ERBB2_breast cancer"
## [4] "cyclophosphamide_ERBB2_breast cancer"
## [5] "fulvestrant_ERBB2_breast cancer"
## [6] "regorafenib_PDGFRA_cancer"
## [7] "regorafenib_KIT_cancer"
## [8] "vemurafenib_BRAF_skin cancer"
## [9] "vemurafenib_BRAF_lung cancer"
## [10] "vemurafenib_BRAF_cancer"
## [11] "vandetanib_RET_thyroid cancer"
## [12] "olaparib_BRCA1_ovarian cancer"
## [13] "olaparib_BRCA1_cancer"
## [14] "olaparib_BRCA2_ovarian cancer"
## [15] "olaparib_BRCA2_cancer"
## [16] "fluorouracil_DPYD_cancer"
## [17] "fulvestrant_ESR1_breast cancer"
## [18] "ibrutinib_MYD88_cancer"
##
## attr(,"intersections")$GDSC
## [1] "docetaxel_ERBB2_breast cancer"
## [2] "palbociclib_ERBB2_breast cancer"
## [3] "alectinib_ALK_lung cancer"
## [4] "alectinib_ROS1_lung cancer"
## [5] "ponatinib_ABL1_leukemia"
## [6] "bosutinib monohydrate_ABL1_leukemia"
## [7] "rucaparib_BRCA1_ovarian cancer"
## [8] "rucaparib_BRCA2_ovarian cancer"
## [9] "midostaurin_FLT3_leukemia"
## [10] "tretinoin_PML_leukemia"
## [11] "vismodegib_PTCH1_skin cancer"
##
## attr(,"intersections")$`CCLE:CTRP`
## [1] "selumetinib_KRAS_lung cancer"
##
## attr(,"intersections")$`CTRP:GDSC`
## [1] "gefitinib_EGFR_lung cancer" "afatinib_EGFR_lung cancer"
## [3] "afatinib_KRAS_lung cancer" "gefitinib_KRAS_lung cancer"
## [5] "afatinib_KRAS_breast cancer" "afatinib_ERBB2_lung cancer"
## [7] "afatinib_ERBB3_lung cancer" "gefitinib_MET_lung cancer"
## [9] "crizotinib_MET_lung cancer" "crizotinib_ALK_lung cancer"
## [11] "crizotinib_ROS1_lung cancer" "imatinib_PDGFRA_leukemia"
## [13] "imatinib_PDGFRA_cancer" "sunitinib_PDGFRA_cancer"
## [15] "dasatinib_ABL1_leukemia" "imatinib_ABL1_leukemia"
## [17] "imatinib_ABL1_cancer" "imatinib_KIT_skin cancer"
## [19] "imatinib_KIT_cancer" "sunitinib_KIT_cancer"
## [21] "imatinib_PDGFRB_cancer" "imatinib_PDGFB_cancer"
## [23] "trametinib_BRAF_skin cancer" "dabrafenib_BRAF_skin cancer"
## [25] "dabrafenib_BRAF_lung cancer" "trametinib_BRAF_lung cancer"
## [27] "trametinib_BRAF_cancer" "dabrafenib_BRAF_cancer"
## [29] "dabrafenib_G6PD_cancer" "cabozantinib_RET_lung cancer"
## [31] "cabozantinib_RET_thyroid cancer" "belinostat_UGT1A1_cancer"
## [33] "pazopanib_UGT1A1_cancer" "ruxolitinib_JAK2_cancer"
## [35] "temozolomide_MGMT_cancer"
##
## attr(,"class")
## [1] "venn"
Test the 102 level-A G2P DGL combinations:
# Pearson correlations per lineage for the G2P drug-gene-lineage (DGL) data.
# For each dataset (CCLE, CTRP, GDSC): collect the non-missing lineage names,
# run PearsonByLineage() once per lineage for gene expression (GE) and copy
# number (CN), stack the per-lineage results, tag them with the dataset name,
# and outer-join the GE and CN tables.

# CCLE
ccle_lineages4pearson <- ccle_data_g2p_dgl$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  .[!is.na(.)]
ccle_signif_ge <- lapply(
  X = ccle_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "GE",
  dataset = ccle_data_g2p_dgl
)
ccle_signif_ge_all_lin <- rbindlist(ccle_signif_ge, use.names = TRUE)
ccle_signif_ge_all_lin$Dataset <- "CCLE"
ccle_signif_cn <- lapply(
  X = ccle_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "CN",
  dataset = ccle_data_g2p_dgl
)
ccle_signif_cn_all_lin <- rbindlist(ccle_signif_cn, use.names = TRUE)
ccle_signif_cn_all_lin$Dataset <- "CCLE"
ccle_signif_all_lin <- merge(
  ccle_signif_ge_all_lin,
  ccle_signif_cn_all_lin,
  by = c("Drug_Gene", "group_general_lineage_name", "n", "Dataset"),
  all = TRUE
)

# CTRP
ctrp_lineages4pearson <- ctrp_data_g2p_dgl$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  .[!is.na(.)]
ctrp_signif_ge <- lapply(
  X = ctrp_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "GE",
  dataset = ctrp_data_g2p_dgl
)
ctrp_signif_ge_all_lin <- rbindlist(ctrp_signif_ge, use.names = TRUE)
ctrp_signif_ge_all_lin$Dataset <- "CTRP"
ctrp_signif_cn <- lapply(
  X = ctrp_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "CN",
  dataset = ctrp_data_g2p_dgl
)
ctrp_signif_cn_all_lin <- rbindlist(ctrp_signif_cn, use.names = TRUE)
ctrp_signif_cn_all_lin$Dataset <- "CTRP"
ctrp_signif_all_lin <- merge(
  ctrp_signif_ge_all_lin,
  ctrp_signif_cn_all_lin,
  by = c("Drug_Gene", "group_general_lineage_name", "n", "Dataset"),
  all = TRUE
)

# GDSC
gdsc_lineages4pearson <- gdsc_data_g2p_dgl$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  .[!is.na(.)]
gdsc_signif_ge <- lapply(
  X = gdsc_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "GE",
  dataset = gdsc_data_g2p_dgl
)
gdsc_signif_ge_all_lin <- rbindlist(gdsc_signif_ge, use.names = TRUE)
gdsc_signif_ge_all_lin$Dataset <- "GDSC"
gdsc_signif_cn <- lapply(
  X = gdsc_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "CN",
  dataset = gdsc_data_g2p_dgl
)
gdsc_signif_cn_all_lin <- rbindlist(gdsc_signif_cn, use.names = TRUE)
gdsc_signif_cn_all_lin$Dataset <- "GDSC"
gdsc_signif_all_lin <- merge(
  gdsc_signif_ge_all_lin,
  gdsc_signif_cn_all_lin,
  by = c("Drug_Gene", "group_general_lineage_name", "n", "Dataset"),
  all = TRUE
)

# Stack the three datasets. A missing n is recoded to 0, and InG2P is "Yes"
# only for rows whose n is non-zero.
signif_all_lin <- rbind(
  ccle_signif_all_lin,
  ctrp_signif_all_lin,
  gdsc_signif_all_lin
)
signif_all_lin$n <- with(signif_all_lin, ifelse(is.na(n), 0, n))
signif_all_lin$InG2P <- with(signif_all_lin, ifelse(n != 0, "Yes", "No"))
signif_all_lin$Drug_Gene_Lin <- with(
  signif_all_lin,
  paste(Drug_Gene, group_general_lineage_name, sep = "_")
)

# Cache the combined table on disk, then load it back under its final name.
saveRDS(signif_all_lin, file = "./data_munging/rds/dgl_signif_pearson_g2p.rds")
dgl_signif_pearson_g2p <- readRDS(file = "./data_munging/rds/dgl_signif_pearson_g2p.rds")
# CCLE mut, n = 7
# Mutation-status tests: keep the nominally significant rows (p < 0.05) and
# record the total number of tests for the Bonferroni threshold.
ccle_mut <- ccle_signif_g2p_lineage %>%
  select(Drug_Gene_Lin, p) %>%
  filter(p < 0.05)
ccle_mut_n <- ccle_signif_g2p_lineage %>%
  select(Drug_Gene_Lin, p) %>%
  nrow()
# CCLE GE/CN, n = 9
# Pearson tests: nominally significant GE and CN rows, plus the number of
# CCLE rows used as the Bonferroni denominator.
ccle_ge <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "CCLE") %>%
  subset(Pearson_Pval_GE < 0.05) %>%
  select(Drug_Gene_Lin, Pearson_Pval_GE, Pearson_Corr_GE)
ccle_cn <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "CCLE") %>%
  subset(Pearson_Pval_CN < 0.05) %>%
  select(Drug_Gene_Lin, Pearson_Pval_CN, Pearson_Corr_CN)
ccle_Pear_n <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "CCLE") %>%
  nrow()
# Outer-join the three result sets on their shared column(s).
ccle_dgl_summ <- merge(ccle_ge, ccle_cn, by = "Drug_Gene_Lin", all = TRUE) %>%
  merge(ccle_mut, ., all = TRUE)
# Split the combined key into Drug / Gene / Lineage and flag p-values that
# pass the Bonferroni-corrected threshold (0.05 / number of tests).
ccle_dgl_summ <- with(
  ccle_dgl_summ,
  cbind(
    colsplit(
      Drug_Gene_Lin,
      pattern = "_",
      names = c("Drug", "Gene", "Lineage")
    ),
    CCLE_p_mut = p,
    CCLE_p_mut_Bonf = ifelse(p < 0.05 / ccle_mut_n, "Sig", NA),
    CCLE_r_GE = Pearson_Corr_GE,
    CCLE_p_GE = Pearson_Pval_GE,
    CCLE_p_GE_Bonf = ifelse(Pearson_Pval_GE < 0.05 / ccle_Pear_n, "Sig", NA),
    CCLE_r_CN = Pearson_Corr_CN,
    CCLE_p_CN = Pearson_Pval_CN,
    CCLE_p_CN_Bonf = ifelse(Pearson_Pval_CN < 0.05 / ccle_Pear_n, "Sig", NA)
  )
)
# CTRP mut, n = 48
# Mutation-status tests: keep the nominally significant rows (p < 0.05) and
# record the total number of tests for the Bonferroni threshold.
ctrp_mut <- ctrp_signif_g2p_lineage %>%
  select(Drug_Gene_Lin, p) %>%
  filter(p < 0.05)
ctrp_mut_n <- ctrp_signif_g2p_lineage %>%
  select(Drug_Gene_Lin, p) %>%
  nrow()
# CTRP GE/CN, n = 61
# Pearson tests: nominally significant GE and CN rows, plus the number of
# CTRP rows used as the Bonferroni denominator.
ctrp_ge <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "CTRP") %>%
  subset(Pearson_Pval_GE < 0.05) %>%
  select(Drug_Gene_Lin, Pearson_Pval_GE, Pearson_Corr_GE)
ctrp_cn <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "CTRP") %>%
  subset(Pearson_Pval_CN < 0.05) %>%
  select(Drug_Gene_Lin, Pearson_Pval_CN, Pearson_Corr_CN)
ctrp_Pear_n <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "CTRP") %>%
  nrow()
# Outer-join the three result sets on their shared column(s).
ctrp_dgl_summ <- merge(ctrp_ge, ctrp_cn, by = "Drug_Gene_Lin", all = TRUE) %>%
  merge(ctrp_mut, ., all = TRUE)
# Split the combined key into Drug / Gene / Lineage and flag p-values that
# pass the Bonferroni-corrected threshold (0.05 / number of tests).
ctrp_dgl_summ <- with(
  ctrp_dgl_summ,
  cbind(
    colsplit(
      Drug_Gene_Lin,
      pattern = "_",
      names = c("Drug", "Gene", "Lineage")
    ),
    CTRP_p_mut = p,
    CTRP_p_mut_Bonf = ifelse(p < 0.05 / ctrp_mut_n, "Sig", NA),
    CTRP_r_GE = Pearson_Corr_GE,
    CTRP_p_GE = Pearson_Pval_GE,
    CTRP_p_GE_Bonf = ifelse(Pearson_Pval_GE < 0.05 / ctrp_Pear_n, "Sig", NA),
    CTRP_r_CN = Pearson_Corr_CN,
    CTRP_p_CN = Pearson_Pval_CN,
    CTRP_p_CN_Bonf = ifelse(Pearson_Pval_CN < 0.05 / ctrp_Pear_n, "Sig", NA)
  )
)
# GDSC mut, n = 39
# Mutation-status tests: keep the nominally significant rows (p < 0.05) and
# record the total number of tests for the Bonferroni threshold.
gdsc_mut <- gdsc_signif_g2p_lineage %>%
  select(Drug_Gene_Lin, p) %>%
  filter(p < 0.05)
gdsc_mut_n <- gdsc_signif_g2p_lineage %>%
  select(Drug_Gene_Lin, p) %>%
  nrow()
# GDSC GE/CN, n = 53
# Pearson tests: nominally significant GE and CN rows, plus the number of
# GDSC rows used as the Bonferroni denominator.
gdsc_ge <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "GDSC") %>%
  subset(Pearson_Pval_GE < 0.05) %>%
  select(Drug_Gene_Lin, Pearson_Pval_GE, Pearson_Corr_GE)
gdsc_cn <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "GDSC") %>%
  subset(Pearson_Pval_CN < 0.05) %>%
  select(Drug_Gene_Lin, Pearson_Pval_CN, Pearson_Corr_CN)
gdsc_Pear_n <- dgl_signif_pearson_g2p %>%
  filter(Dataset == "GDSC") %>%
  nrow()
# Outer-join the three result sets on their shared column(s).
gdsc_dgl_summ <- merge(gdsc_ge, gdsc_cn, by = "Drug_Gene_Lin", all = TRUE) %>%
  merge(gdsc_mut, ., all = TRUE)
# Split the combined key into Drug / Gene / Lineage and flag p-values that
# pass the Bonferroni-corrected threshold (0.05 / number of tests).
gdsc_dgl_summ <- with(
  gdsc_dgl_summ,
  cbind(
    colsplit(
      Drug_Gene_Lin,
      pattern = "_",
      names = c("Drug", "Gene", "Lineage")
    ),
    GDSC_p_mut = p,
    GDSC_p_mut_Bonf = ifelse(p < 0.05 / gdsc_mut_n, "Sig", NA),
    GDSC_r_GE = Pearson_Corr_GE,
    GDSC_p_GE = Pearson_Pval_GE,
    GDSC_p_GE_Bonf = ifelse(Pearson_Pval_GE < 0.05 / gdsc_Pear_n, "Sig", NA),
    GDSC_r_CN = Pearson_Corr_CN,
    GDSC_p_CN = Pearson_Pval_CN,
    GDSC_p_CN_Bonf = ifelse(Pearson_Pval_CN < 0.05 / gdsc_Pear_n, "Sig", NA)
  )
)
# Outer-join the per-dataset summaries on Drug / Gene / Lineage
# (CCLE with CTRP first, then the result with GDSC).
g2p_dgl_sig_summ <- Reduce(
  function(lhs, rhs) {
    merge(lhs, rhs, by = c("Drug", "Gene", "Lineage"), all = TRUE)
  },
  list(ccle_dgl_summ, ctrp_dgl_summ, gdsc_dgl_summ)
)

# One label per dataset/feature pair. The matching p-value column is the label
# with "_p" inserted after the dataset prefix (e.g. "CCLE_mut" -> "CCLE_p_mut"),
# and the Bonferroni flag column appends "_Bonf" to that name.
g2p_dgl_sig_summ_labels <- c(
  "CCLE_mut", "CCLE_GE", "CCLE_CN",
  "CTRP_mut", "CTRP_GE", "CTRP_CN",
  "GDSC_mut", "GDSC_GE", "GDSC_CN"
)

# Number of non-missing p-values per dataset/feature.
g2p_dgl_sig_summ_n <- vapply(
  sub("_", "_p_", g2p_dgl_sig_summ_labels, fixed = TRUE),
  function(col) sum(!is.na(g2p_dgl_sig_summ[[col]])),
  integer(1),
  USE.NAMES = FALSE
)

# Percentage of those p-values that also carry a Bonferroni "Sig" flag.
g2p_dgl_sig_summ_percents <- vapply(
  paste0(sub("_", "_p_", g2p_dgl_sig_summ_labels, fixed = TRUE), "_Bonf"),
  function(col) sum(!is.na(g2p_dgl_sig_summ[[col]])),
  integer(1),
  USE.NAMES = FALSE
) / g2p_dgl_sig_summ_n * 100

g2p_dgl_sig_summ_stats <- data.frame(
  Dataset_Feature = g2p_dgl_sig_summ_labels,
  n_measured = g2p_dgl_sig_summ_n,
  Percent_Sig_Bonf = g2p_dgl_sig_summ_percents
)
Test all combinations of G2P drugs and genes in all lineages. InG2P indicates G2P drug-gene associations tested across all lineages.
# Pearson correlations per lineage for the G2P drug x gene grid data.
# For each dataset (CCLE, CTRP, GDSC): collect the non-missing lineage names,
# run PearsonByLineage() once per lineage for gene expression (GE) and copy
# number (CN), stack the per-lineage results, tag them with the dataset name,
# and outer-join the GE and CN tables.

# CCLE
ccle_lineages4pearson <- ccle_data_g2p_grid$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  .[!is.na(.)]
ccle_signif_ge <- lapply(
  X = ccle_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "GE",
  dataset = ccle_data_g2p_grid
)
ccle_signif_ge_all_lin <- rbindlist(ccle_signif_ge, use.names = TRUE)
ccle_signif_ge_all_lin$Dataset <- "CCLE"
ccle_signif_cn <- lapply(
  X = ccle_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "CN",
  dataset = ccle_data_g2p_grid
)
ccle_signif_cn_all_lin <- rbindlist(ccle_signif_cn, use.names = TRUE)
ccle_signif_cn_all_lin$Dataset <- "CCLE"
ccle_signif_all_lin <- merge(
  ccle_signif_ge_all_lin,
  ccle_signif_cn_all_lin,
  by = c("Drug_Gene", "group_general_lineage_name", "n", "Dataset"),
  all = TRUE
)

# CTRP
ctrp_lineages4pearson <- ctrp_data_g2p_grid$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  .[!is.na(.)]
ctrp_signif_ge <- lapply(
  X = ctrp_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "GE",
  dataset = ctrp_data_g2p_grid
)
ctrp_signif_ge_all_lin <- rbindlist(ctrp_signif_ge, use.names = TRUE)
ctrp_signif_ge_all_lin$Dataset <- "CTRP"
ctrp_signif_cn <- lapply(
  X = ctrp_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "CN",
  dataset = ctrp_data_g2p_grid
)
ctrp_signif_cn_all_lin <- rbindlist(ctrp_signif_cn, use.names = TRUE)
ctrp_signif_cn_all_lin$Dataset <- "CTRP"
ctrp_signif_all_lin <- merge(
  ctrp_signif_ge_all_lin,
  ctrp_signif_cn_all_lin,
  by = c("Drug_Gene", "group_general_lineage_name", "n", "Dataset"),
  all = TRUE
)

# GDSC
gdsc_lineages4pearson <- gdsc_data_g2p_grid$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  .[!is.na(.)]
gdsc_signif_ge <- lapply(
  X = gdsc_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "GE",
  dataset = gdsc_data_g2p_grid
)
gdsc_signif_ge_all_lin <- rbindlist(gdsc_signif_ge, use.names = TRUE)
gdsc_signif_ge_all_lin$Dataset <- "GDSC"
gdsc_signif_cn <- lapply(
  X = gdsc_lineages4pearson,
  FUN = PearsonByLineage,
  variable = "CN",
  dataset = gdsc_data_g2p_grid
)
gdsc_signif_cn_all_lin <- rbindlist(gdsc_signif_cn, use.names = TRUE)
gdsc_signif_cn_all_lin$Dataset <- "GDSC"
gdsc_signif_all_lin <- merge(
  gdsc_signif_ge_all_lin,
  gdsc_signif_cn_all_lin,
  by = c("Drug_Gene", "group_general_lineage_name", "n", "Dataset"),
  all = TRUE
)

# Stack the three datasets. A missing n is recoded to 0, and InG2P is "Yes"
# only for rows whose n is non-zero.
signif_all_lin <- rbind(
  ccle_signif_all_lin,
  ctrp_signif_all_lin,
  gdsc_signif_all_lin
)
signif_all_lin$n <- with(signif_all_lin, ifelse(is.na(n), 0, n))
signif_all_lin$InG2P <- with(signif_all_lin, ifelse(n != 0, "Yes", "No"))
signif_all_lin$Drug_Gene_Lin <- with(
  signif_all_lin,
  paste(Drug_Gene, group_general_lineage_name, sep = "_")
)

# Cache the combined table on disk, then load it back under its final name.
saveRDS(signif_all_lin, file = "./data_munging/rds/dgl_signif_pearson_g2p_grid.rds")
dgl_signif_pearson_g2p_grid <- readRDS(file = "./data_munging/rds/dgl_signif_pearson_g2p_grid.rds")
This section contains summary plots for the Wilcoxon tests and Pearson/Spearman correlations performed only on level-A G2P drug-gene associations (i.e., not the grid tests) and, for the CRISPR dataset, on G2P genes.
Venn diagram between GE and CN:
# Raise the "VennDiagramLogger" threshold to ERROR before drawing.
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
## NULL
# Keep only the nominally significant (p < 0.05) Spearman results.
crispr_spearman_res_GE_sig <- crispr_spearman_res_GE %>% filter(p < 0.05)
crispr_spearman_res_CN_sig <- crispr_spearman_res_CN %>% filter(p < 0.05)
# Two-set Venn diagram of the significant genes (GE vs CN). With
# filename = NULL the drawing is returned as an object rather than written out.
crispr_venn <- venn.diagram(
  x = list(
    "Gene Expression" = as.character(crispr_spearman_res_GE_sig$Hugo_Symbol),
    "Copy Number" = as.character(crispr_spearman_res_CN_sig$Hugo_Symbol)
  ),
  filename = NULL,
  fill = c("darkorchid4", "mediumorchid1"),
  lty = rep("blank", 2),
  alpha = c(0.5, 0.5),
  cat.pos = c(-2, 5),
  cat.cex = 1.5,
  cex = 1
)
# Overwrite the labels of elements 5-7 with sorted, newline-separated gene names.
# in GE only
crispr_venn[[5]]$label <- setdiff(
  as.character(crispr_spearman_res_GE_sig$Hugo_Symbol),
  as.character(crispr_spearman_res_CN_sig$Hugo_Symbol)
) %>%
  sort() %>%
  paste(collapse = "\n")
# in CN only
crispr_venn[[6]]$label <- setdiff(
  as.character(crispr_spearman_res_CN_sig$Hugo_Symbol),
  as.character(crispr_spearman_res_GE_sig$Hugo_Symbol)
) %>%
  sort() %>%
  paste(collapse = "\n")
# intersection
crispr_venn[[7]]$label <- intersect(
  as.character(crispr_spearman_res_CN_sig$Hugo_Symbol),
  as.character(crispr_spearman_res_GE_sig$Hugo_Symbol)
) %>%
  sort() %>%
  paste(collapse = "\n")
ggsave(filename = "./plots/manuscript/crispr_venn.pdf", plot = crispr_venn)
G2P genes with a significant effect on CERES score by mutation status, gene expression, and copy number. Summary plot using Pearson correlation coefficient for GE and CN:
# Mutation-status results for the CRISPR summary plots: one row per gene with
# its p-value, a placeholder correlation of 0 (mutation status has no
# correlation coefficient), a Metric label, and -log10(p).
crispr_res_mut <- select(crispr_signif_g2p_gene, Hugo_Symbol, p)
# p arrives as character. Comparing a character vector with a number coerces
# the number to a string and compares text, so a p-value printed in scientific
# notation (e.g. "1e-05") would fail `p < 0.05`. Convert once so every later
# comparison is numeric.
crispr_res_mut$p <- as.numeric(crispr_res_mut$p)
crispr_res_mut$corr <- 0
crispr_res_mut$Metric <- "Mutation Status"
crispr_res_mut$p_log10 <- -log10(crispr_res_mut$p)
# Display the nominally significant genes.
subset(crispr_res_mut, p < 0.05)
## # A tibble: 4 x 5
## Hugo_Symbol p corr Metric p_log10
## <fct> <chr> <dbl> <chr> <dbl>
## 1 UGT1A1 0.0199589726138992 0 Mutation Status 1.70
## 2 TSC2 0.0237251683642146 0 Mutation Status 1.62
## 3 CYP19A1 0.0296979960495625 0 Mutation Status 1.53
## 4 DPYD 0.0409200622800623 0 Mutation Status 1.39
# Display the rows of crispr_pearson_res_GE with p < 0.05.
crispr_pearson_res_GE %>% subset(p < 0.05)
## # A tibble: 16 x 4
## Hugo_Symbol p corr Metric
## <fct> <dbl> <dbl> <chr>
## 1 EGFR 5.33e-14 -0.326 Gene Expression
## 2 KRAS 6.73e- 3 -0.120 Gene Expression
## 3 ERBB2 2.94e-11 -0.289 Gene Expression
## 4 MET 3.49e- 9 -0.258 Gene Expression
## 5 ALK 2.41e-10 -0.276 Gene Expression
## 6 PDGFRA 8.93e-20 -0.388 Gene Expression
## 7 PDGFRB 2.40e-16 -0.353 Gene Expression
## 8 PDGFB 3.37e- 2 0.0943 Gene Expression
## 9 BRCA1 2.36e- 4 0.163 Gene Expression
## 10 BRCA2 3.24e- 2 0.0948 Gene Expression
## 11 TSC1 4.20e- 2 0.0902 Gene Expression
## 12 TSC2 2.40e- 2 0.100 Gene Expression
## 13 AKT1 1.66e- 5 -0.190 Gene Expression
## 14 ESR1 3.76e-47 -0.580 Gene Expression
## 15 CYP19A1 1.11e- 6 0.214 Gene Expression
## 16 FLT3 1.47e-17 -0.366 Gene Expression
# Display the rows of crispr_pearson_res_CN with p < 0.05.
crispr_pearson_res_CN %>% subset(p < 0.05)
## # A tibble: 14 x 4
## Hugo_Symbol p corr Metric
## <fct> <dbl> <dbl> <chr>
## 1 EGFR 1.02e- 3 -0.145 Copy Number
## 2 KRAS 4.63e-10 -0.270 Copy Number
## 3 ERBB2 5.72e-14 -0.323 Copy Number
## 4 MET 4.80e- 7 -0.220 Copy Number
## 5 ALK 3.94e- 6 -0.202 Copy Number
## 6 PDGFRA 9.14e- 4 -0.146 Copy Number
## 7 ABL1 3.07e- 4 0.158 Copy Number
## 8 PDGFB 1.23e- 3 0.142 Copy Number
## 9 BRAF 2.28e- 8 -0.244 Copy Number
## 10 BRCA1 1.81e- 3 0.137 Copy Number
## 11 BRCA2 1.95e- 9 0.260 Copy Number
## 12 DPYD 2.50e- 2 -0.0987 Copy Number
## 13 TPMT 2.52e- 3 -0.133 Copy Number
## 14 FLT3 8.69e- 3 -0.115 Copy Number
# Stack the GE and CN Pearson results, fix the facet order (GE first), and add
# -log10(p) for the y-axis.
crispr_pearson_res <- rbind(crispr_pearson_res_GE, crispr_pearson_res_CN)
crispr_pearson_res$Metric <- factor(crispr_pearson_res$Metric, levels = c("Gene Expression", "Copy Number"))
crispr_pearson_res$p_log10 <- -log10(crispr_pearson_res$p)

# Left panel: nominally significant mutation-status results. All points sit at
# x = corr = 0, and the x-axis decorations are hidden.
# The p column of crispr_res_mut can be character (its printed tibble shows
# <chr>). Comparing text with a number is a string comparison, which would drop
# p-values written in scientific notation, so the filter compares as.numeric(p).
crispr_summ_res_mut_plot <- ggplot(data = subset(crispr_res_mut, as.numeric(p) < 0.05)) +
  facet_grid(~ Metric) +
  scale_y_continuous(breaks = seq(0, 50, by = 5), labels = seq(0, 50, by = 5)) +
  coord_cartesian(ylim = c(0, 50)) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_label_repel(mapping = aes(y = p_log10, x = corr, label = Hugo_Symbol, color = Metric), force = 10, size = 2.5, segment.size = 0.2, segment.colour = "grey50", box.padding = 0.6) +
  theme(legend.position = "none", panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), plot.margin = unit(c(5.5, 1.5, 5.5, 5.5), "pt")) +
  scale_color_manual(values = c("Mutation Status" = "darkorchid4")) +
  labs(y = "-log10(p-value)")

# Right panels: nominally significant Pearson results, one facet per metric,
# plotted as correlation (x) against -log10(p) (y). The y-axis decorations are
# hidden because the left panel uses the same y limits and breaks.
crispr_summ_res_pearson_plot <- ggplot(data = subset(crispr_pearson_res, p < 0.05)) +
  facet_grid(~ Metric, scales = "free_x", space = "free") +
  scale_y_continuous(breaks = seq(0, 50, by = 5), labels = seq(0, 50, by = 5)) +
  scale_x_continuous(breaks = seq(-0.6, 0.3, by = 0.1), labels = formatC(seq(-0.6, 0.3, by = 0.1), digits = 2, format = "f")) +
  coord_cartesian(ylim = c(0, 50)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_label_repel(mapping = aes(y = p_log10, x = corr, color = Metric, label = Hugo_Symbol), force = 10, nudge_y = 2, nudge_x = 0.05, size = 2.5, segment.size = 0.2, segment.colour = "grey50", box.padding = 0.6) +
  scale_color_manual(values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")) +
  theme(legend.position = "none", axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")) +
  labs(x = "Pearson correlation coefficient (r)")

# Place the two plots side by side (1:5 width ratio) and display the result.
crispr_genomic_features_pearson_summ_plot <- plot_grid(crispr_summ_res_mut_plot, crispr_summ_res_pearson_plot, align = "h", rel_widths = c(1, 5))
crispr_genomic_features_pearson_summ_plot
# ggsave("./plots/manuscript/crispr_genomic_features_pearson_summ_plot.png", crispr_genomic_features_pearson_summ_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
Summary plot using Spearman correlation coefficient for GE and CN:
# Display the rows of crispr_spearman_res_GE with p < 0.05.
crispr_spearman_res_GE %>% subset(p < 0.05)
## # A tibble: 13 x 4
## Hugo_Symbol p corr Metric
## <fct> <dbl> <dbl> <chr>
## 1 EGFR 1.15e-22 -0.417 Gene Expression
## 2 KRAS 6.43e-10 -0.271 Gene Expression
## 3 ERBB2 1.05e-11 -0.297 Gene Expression
## 4 ERBB3 1.57e- 7 -0.230 Gene Expression
## 5 ALK 2.13e- 3 -0.136 Gene Expression
## 6 PDGFRA 5.92e- 5 -0.177 Gene Expression
## 7 KIT 2.01e- 2 -0.103 Gene Expression
## 8 PDGFRB 1.05e- 3 -0.145 Gene Expression
## 9 PDGFB 1.88e- 2 0.104 Gene Expression
## 10 BRAF 3.14e- 2 -0.0958 Gene Expression
## 11 BRCA1 4.97e- 6 0.202 Gene Expression
## 12 AKT1 5.63e- 3 -0.123 Gene Expression
## 13 PML 5.85e- 3 0.122 Gene Expression
# Display the rows of crispr_spearman_res_CN with p < 0.05.
crispr_spearman_res_CN %>% subset(p < 0.05)
## # A tibble: 12 x 4
## Hugo_Symbol p corr Metric
## <fct> <dbl> <dbl> <chr>
## 1 EGFR 2.16e- 4 -0.163 Copy Number
## 2 KRAS 1.13e- 7 -0.231 Copy Number
## 3 ERBB2 3.84e-10 -0.271 Copy Number
## 4 ABL1 1.96e- 3 0.136 Copy Number
## 5 PDGFB 9.65e- 4 0.145 Copy Number
## 6 BRAF 8.56e- 4 -0.147 Copy Number
## 7 BRCA1 1.07e- 3 0.144 Copy Number
## 8 BRCA2 6.58e- 7 0.217 Copy Number
## 9 DPYD 8.31e- 3 -0.116 Copy Number
## 10 TPMT 2.25e- 2 -0.101 Copy Number
## 11 PTCH1 6.26e- 3 0.120 Copy Number
## 12 JAK2 2.56e- 3 0.133 Copy Number
# Stack the GE and CN Spearman results, fix the facet order (GE first), and
# add -log10(p) for the y-axis.
crispr_spearman_res <- rbind(crispr_spearman_res_GE, crispr_spearman_res_CN) %>%
  mutate(
    Metric = factor(Metric, levels = c("Gene Expression", "Copy Number")),
    p_log10 = -log10(p)
  )

# Nominally significant Spearman results, one facet per metric, plotted as
# correlation (x) against -log10(p) (y). The y-axis decorations are hidden.
crispr_summ_res_spearman_plot <- ggplot(data = subset(crispr_spearman_res, p < 0.05)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_label_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Hugo_Symbol),
    force = 10,
    nudge_y = 2,
    nudge_x = 0.05,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  facet_grid(~ Metric, scales = "free_x", space = "free") +
  scale_y_continuous(breaks = seq(0, 50, by = 5), labels = seq(0, 50, by = 5)) +
  scale_x_continuous(
    breaks = seq(-0.6, 0.3, by = 0.1),
    labels = formatC(seq(-0.6, 0.3, by = 0.1), digits = 2, format = "f")
  ) +
  scale_color_manual(
    values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")
  ) +
  coord_cartesian(ylim = c(0, 50)) +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(x = "Spearman's rank correlation coefficient (r_s)")

# Pair the mutation-status panel with the Spearman panels (1:5 width ratio)
# and display the result.
crispr_genomic_features_spearman_summ_plot <- plot_grid(
  crispr_summ_res_mut_plot,
  crispr_summ_res_spearman_plot,
  align = "h",
  rel_widths = c(1, 5)
)
crispr_genomic_features_spearman_summ_plot
# ggsave("./plots/manuscript/crispr_genomic_features_spearman_summ_plot.png", crispr_genomic_features_spearman_summ_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
Pearson correlations for gene expression and copy number:
# Per-gene Pearson correlation plots for the CRISPR data (GE and CN).
# Genes are ordered along the x-axis by their correlation, and the point fill
# encodes -log10(p).
crispr_pearson_res_GE <- transform(crispr_pearson_res_GE, Hugo_Symbol = reorder(Hugo_Symbol, corr))
crispr_pearson_res_GE$p_log10 <- -log10(crispr_pearson_res_GE$p)
crispr_pearson_res_CN <- transform(crispr_pearson_res_CN, Hugo_Symbol = reorder(Hugo_Symbol, corr))
crispr_pearson_res_CN$p_log10 <- -log10(crispr_pearson_res_CN$p)

# The fill midpoint 1.3010299957 is approximately -log10(0.05). With low and
# mid both white, only points with p below about 0.05 pick up colour.
# Label fixes: Pearson's coefficient is written r (r_s denotes Spearman's), and
# the CRISPR response is the CERES score, not an AUC z-score.
crispr_ge_pearson_corr_plot <- ggplot(data = crispr_pearson_res_GE, aes(x = Hugo_Symbol, y = corr)) +
  geom_point(aes(fill = p_log10), pch = 21, size = 5) +
  scale_fill_gradient2(low = "white", mid = "white", high = "darkorchid4", midpoint = 1.3010299957) +
  coord_cartesian(ylim = c(-0.65, 0.25)) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 10)) +
  labs(fill = "-log10(p)", x = "Gene", y = "Pearson Correlation Coefficient (r)", subtitle = "CRISPR: Correlation between CERES score and gene expression")
# Same layout for copy number. The y-axis title is blanked because this plot
# sits to the right of the GE plot.
crispr_cn_pearson_corr_plot <- ggplot(data = crispr_pearson_res_CN, aes(x = Hugo_Symbol, y = corr)) +
  geom_point(aes(fill = p_log10), pch = 21, size = 5) +
  scale_fill_gradient2(low = "white", mid = "white", high = "darkorchid4", midpoint = 1.3010299957) +
  coord_cartesian(ylim = c(-0.65, 0.25)) +
  theme(legend.position = "bottom", axis.title.y = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 10)) +
  labs(fill = "-log10(p)", x = "Gene", y = "Pearson Correlation Coefficient (r)", subtitle = "CRISPR: Correlation between CERES score and copy number")
# Place the two plots side by side and display the result.
crispr_pearson_corr_plot <- plot_grid(crispr_ge_pearson_corr_plot, crispr_cn_pearson_corr_plot, align = "h")
crispr_pearson_corr_plot
# ggsave("./plots/manuscript/crispr_pearson_corr_plot.png", crispr_pearson_corr_plot, device = "png", dpi = 450, width = 12, height = 6, units = "in")
Spearman correlations for gene expression and copy number:
# Per-gene Spearman correlation plots for the CRISPR data (GE and CN), each
# with an inset scatter plot for KRAS.
# Genes are ordered along the x-axis by their correlation, and the point fill
# encodes -log10(p).
crispr_spearman_res_GE <- transform(
  crispr_spearman_res_GE,
  Hugo_Symbol = reorder(Hugo_Symbol, corr)
)
crispr_spearman_res_GE$p_log10 <- -log10(crispr_spearman_res_GE$p)
crispr_spearman_res_CN <- transform(
  crispr_spearman_res_CN,
  Hugo_Symbol = reorder(Hugo_Symbol, corr)
)
crispr_spearman_res_CN$p_log10 <- -log10(crispr_spearman_res_CN$p)

# Spearman test of KRAS CERES score against expression (row 1) and copy number
# (row 2); the inset subtitles read these values.
# NOTE(review): `use` does not appear to be a formal argument of
# stats::cor.test and is presumably absorbed by `...` - confirm.
spearman_KRAS <- rbind(
  crispr_data_g2p %>%
    filter(Hugo_Symbol == "KRAS") %>%
    summarize(
      p = cor.test(y = Score, x = RPKM, method = "spearman", use = "complete.obs")$p.value,
      corr = cor.test(y = Score, x = RPKM, method = "spearman", use = "complete.obs")$estimate,
      Metric = "Gene Expression"
    ),
  crispr_data_g2p %>%
    filter(Hugo_Symbol == "KRAS") %>%
    summarize(
      p = cor.test(y = Score, x = CN, method = "spearman", use = "complete.obs")$p.value,
      corr = cor.test(y = Score, x = CN, method = "spearman", use = "complete.obs")$estimate,
      Metric = "Copy Number"
    )
)

# Inset scatter plots: KRAS score against expression and against copy number.
crispr_ge_spearman_corr_inset <- crispr_data_g2p %>%
  filter(Hugo_Symbol == "KRAS") %>%
  ggplot(aes(x = RPKM, y = Score)) +
  geom_point(pch = 21, fill = "cadetblue3", alpha = 0.6) +
  labs(
    x = "KRAS Gene Expression (RPKM)",
    y = "CERES Score",
    subtitle = paste0(
      "Spearman correlation coefficient = ", round(spearman_KRAS$corr[1], 2),
      " (p = ", formatC(as.numeric(spearman_KRAS$p[1]), format = "e", digits = 1), ")"
    )
  )
crispr_cn_spearman_corr_inset <- crispr_data_g2p %>%
  filter(Hugo_Symbol == "KRAS") %>%
  ggplot(aes(x = CN, y = Score)) +
  geom_point(pch = 21, fill = "palegreen2", alpha = 0.6) +
  labs(
    x = "KRAS Copy Number (log2 ratio)",
    y = "CERES Score",
    subtitle = paste0(
      "Spearman correlation coefficient = ", round(spearman_KRAS$corr[2], 2),
      " (p = ", formatC(as.numeric(spearman_KRAS$p[2]), format = "e", digits = 1), ")"
    )
  )

# Main plots. The inset is embedded with annotation_custom() and a curved grey
# arrow is drawn towards it; all coordinates are hard-coded.
# The fill midpoint 1.3010299957 is approximately -log10(0.05).
crispr_ge_spearman_corr_plot <- ggplot(
  data = crispr_spearman_res_GE,
  mapping = aes(x = Hugo_Symbol, y = corr)
) +
  geom_point(aes(fill = p_log10), pch = 21, size = 5) +
  annotation_custom(
    ggplotGrob(crispr_ge_spearman_corr_inset),
    xmin = 12, xmax = 32, ymin = -0.15, ymax = -0.45
  ) +
  geom_curve(
    aes(x = 3.5, y = -0.27, xend = 14, yend = -0.165),
    color = "lightgray",
    curvature = -0.2,
    arrow = arrow(length = unit(0.03, "npc"))
  ) +
  scale_fill_gradient2(
    low = "white", mid = "white", high = "deepskyblue4",
    midpoint = 1.3010299957
  ) +
  coord_cartesian(ylim = c(-0.45, 0.2)) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 10)
  ) +
  labs(
    fill = "-log10(p)",
    x = "Gene",
    y = expression("Spearman Correlation Coefficient"~(r[s]))
  )
crispr_cn_spearman_corr_plot <- ggplot(
  data = crispr_spearman_res_CN,
  mapping = aes(x = Hugo_Symbol, y = corr)
) +
  geom_point(aes(fill = p_log10), pch = 21, size = 5) +
  annotation_custom(
    ggplotGrob(crispr_cn_spearman_corr_inset),
    xmin = 12, xmax = 32, ymin = -0.15, ymax = -0.45
  ) +
  geom_curve(
    aes(x = 2.5, y = -0.232, xend = 14, yend = -0.165),
    color = "lightgray",
    curvature = -0.2,
    arrow = arrow(length = unit(0.03, "npc"))
  ) +
  scale_fill_gradient2(
    low = "white", mid = "white", high = "green4",
    midpoint = 1.3010299957
  ) +
  coord_cartesian(ylim = c(-0.45, 0.2)) +
  theme(
    legend.position = "bottom",
    axis.title.y = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 10)
  ) +
  labs(
    fill = "-log10(p)",
    x = "Gene",
    y = "Spearman Correlation Coefficient (r_s)"
  )

# Place the two plots side by side and display the result.
crispr_spearman_corr_plot <- plot_grid(
  crispr_ge_spearman_corr_plot,
  crispr_cn_spearman_corr_plot,
  align = "h"
)
crispr_spearman_corr_plot
# ggsave("./plots/manuscript/crispr_spearman_corr_plot.pdf", crispr_spearman_corr_plot, device = "pdf", dpi = 450, width = 16, height = 8, units = "in")
G2P drug-gene associations with a significant effect on AUC by mutation status, gene expression, and copy number. Summary plot using Pearson correlation coefficient for GE and CN:
# CCLE mutation-status results: p-value per drug-gene pair, a placeholder
# correlation of 0, a Metric label, -log10(p), and the dataset name.
ccle_res_mut <- ccle_signif_g2p %>%
  select(Drug_Gene, p) %>%
  mutate(
    corr = 0,
    Metric = "Mutation Status",
    p_log10 = -log10(p),
    Dataset = "CCLE"
  )
# CCLE gene-expression Pearson results for rows flagged InG2P == "Yes",
# renamed to the generic corr / p columns used by the plots.
ccle_pearson_res_GE <- ccle_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Pearson_Corr_GE, p = Pearson_Pval_GE, Dataset) %>%
  mutate(Metric = "Gene Expression")
# Display the nominally significant rows.
subset(ccle_pearson_res_GE, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 erlotinib_EGFR -0.3366681 7.246356e-14 CCLE Gene Expression
## 5 lapatinib_ERBB2 -0.4577990 6.239520e-26 CCLE Gene Expression
## 7 nilotinib_ABL1 -0.1981532 7.492822e-05 CCLE Gene Expression
# CCLE copy-number Pearson results for rows flagged InG2P == "Yes",
# renamed to the generic corr / p columns used by the plots.
ccle_pearson_res_CN <- ccle_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Pearson_Corr_CN, p = Pearson_Pval_CN, Dataset) %>%
  mutate(Metric = "Copy Number")
# Display the nominally significant rows.
subset(ccle_pearson_res_CN, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 erlotinib_EGFR -0.2031493 6.219171e-06 CCLE Copy Number
## 5 lapatinib_ERBB2 -0.3898418 2.424545e-19 CCLE Copy Number
# Stack the GE and CN Pearson results, fix the facet order (GE first), and add
# -log10(p) for the y-axis.
ccle_pearson_res_filt <- rbind(ccle_pearson_res_GE, ccle_pearson_res_CN) %>%
  mutate(
    Metric = factor(Metric, levels = c("Gene Expression", "Copy Number")),
    p_log10 = -log10(p)
  )

# Left panel: nominally significant mutation-status results. All points sit at
# x = corr = 0, and the x-axis decorations and row strip are hidden.
ccle_summ_res_mut_plot <- ggplot(data = subset(ccle_res_mut, p < 0.05)) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_label_repel(
    mapping = aes(y = p_log10, x = corr, label = Drug_Gene, color = Metric),
    force = 10,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  facet_grid(Dataset ~ Metric) +
  scale_y_continuous(breaks = seq(0, 35, by = 5), labels = seq(0, 35, by = 5)) +
  scale_color_manual(values = c("Mutation Status" = "darkorchid4")) +
  coord_cartesian(ylim = c(0, 25)) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(5.5, 1.5, 5.5, 5.5), "pt"),
    strip.background.y = element_blank(),
    strip.text.y = element_blank()
  ) +
  labs(y = "-log10(p-value)")

# Right panels: nominally significant Pearson results, one facet per metric,
# plotted as correlation (x) against -log10(p) (y). The y-axis decorations are
# hidden.
ccle_summ_res_pearson_plot <- ggplot(data = subset(ccle_pearson_res_filt, p < 0.05)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_label_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Drug_Gene),
    force = 10,
    nudge_y = 2,
    nudge_x = 0.05,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  facet_grid(Dataset ~ Metric) +
  scale_color_manual(
    values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")
  ) +
  coord_cartesian(ylim = c(0, 25)) +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(x = "Pearson correlation coefficient (r)")

# Place the two plots side by side (1:5 width ratio) and display the result.
ccle_genomic_features_pearson_summ_plot <- plot_grid(
  ccle_summ_res_mut_plot,
  ccle_summ_res_pearson_plot,
  align = "h",
  rel_widths = c(1, 5)
)
ccle_genomic_features_pearson_summ_plot
# ggsave("./plots/manuscript/ccle_genomic_features_pearson_summ_plot.png", ccle_genomic_features_pearson_summ_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
Summary plot using Spearman correlation coefficient for GE and CN:
# CCLE gene-expression Spearman results for rows flagged InG2P == "Yes",
# renamed to the generic corr / p columns used by the plots.
ccle_spearman_res_GE <- ccle_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Spearman_Corr_GE, p = Spearman_Pval_GE, Dataset) %>%
  mutate(Metric = "Gene Expression")
# Display the nominally significant rows.
subset(ccle_spearman_res_GE, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 erlotinib_EGFR -0.1806802 8.471561e-05 CCLE Gene Expression
## 2 erlotinib_KRAS -0.1028632 2.527693e-02 CCLE Gene Expression
## 4 irinotecan_UGT1A1 0.1888238 9.947651e-04 CCLE Gene Expression
## 5 lapatinib_ERBB2 -0.2911699 1.026204e-10 CCLE Gene Expression
# CCLE copy-number Spearman results for rows flagged InG2P == "Yes",
# renamed to the generic corr / p columns used by the plots.
ccle_spearman_res_CN <- ccle_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Spearman_Corr_CN, p = Spearman_Pval_CN, Dataset) %>%
  mutate(Metric = "Copy Number")
# Display the nominally significant rows.
subset(ccle_spearman_res_CN, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 3 erlotinib_MET 0.1087566 0.0159133177 CCLE Copy Number
## 5 lapatinib_ERBB2 -0.1479346 0.0009860106 CCLE Copy Number
# Stack the GE and CN Spearman results, fix the facet order (GE first), and
# add -log10(p) for the y-axis.
ccle_spearman_res_filt <- rbind(ccle_spearman_res_GE, ccle_spearman_res_CN) %>%
  mutate(
    Metric = factor(Metric, levels = c("Gene Expression", "Copy Number")),
    p_log10 = -log10(p)
  )

# Nominally significant Spearman results, one facet per metric, plotted as
# correlation (x) against -log10(p) (y). The y-axis decorations are hidden.
ccle_summ_res_spearman_plot <- ggplot(data = subset(ccle_spearman_res_filt, p < 0.05)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_label_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Drug_Gene),
    force = 10,
    nudge_y = 2,
    nudge_x = 0.05,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  facet_grid(Dataset ~ Metric) +
  scale_color_manual(
    values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")
  ) +
  coord_cartesian(ylim = c(0, 25)) +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(x = "Spearman's rank correlation coefficient (r_s)")

# Pair the mutation-status panel with the Spearman panels (1:5 width ratio)
# and display the result.
ccle_genomic_features_spearman_summ_plot <- plot_grid(
  ccle_summ_res_mut_plot,
  ccle_summ_res_spearman_plot,
  align = "h",
  rel_widths = c(1, 5)
)
ccle_genomic_features_spearman_summ_plot
# ggsave("./plots/manuscript/ccle_genomic_features_spearman_summ_plot.png", ccle_genomic_features_spearman_summ_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
G2P drug-gene associations with a significant effect on AUC by mutation status, gene expression, and copy number. Summary plot using Pearson correlation coefficient for GE and CN:
# CTRP mutation-status results: p-value per drug-gene pair, a placeholder
# correlation of 0, a Metric label, -log10(p), and the dataset name.
ctrp_res_mut <- ctrp_signif_g2p %>%
  select(Drug_Gene, p) %>%
  mutate(
    corr = 0,
    Metric = "Mutation Status",
    p_log10 = -log10(p),
    Dataset = "CTRP"
  )
# CTRP gene-expression Pearson results for rows flagged InG2P == "Yes",
# renamed to the generic corr / p columns used by the plots.
ctrp_pearson_res_GE <- ctrp_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Pearson_Corr_GE, p = Pearson_Pval_GE, Dataset) %>%
  mutate(Metric = "Gene Expression")
# Display the nominally significant rows.
subset(ctrp_pearson_res_GE, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.27700499 4.685708e-13 CTRP Gene Expression
## 2 afatinib_ERBB2 -0.30388638 1.185520e-15 CTRP Gene Expression
## 3 afatinib_ERBB3 -0.14389941 2.011177e-04 CTRP Gene Expression
## 5 azd_EGFR 0.07642735 3.778631e-02 CTRP Gene Expression
## 9 crizotinib_ALK -0.17932624 1.062161e-06 CTRP Gene Expression
## 10 crizotinib_MET 0.09262027 1.223555e-02 CTRP Gene Expression
## 11 crizotinib_ROS1 0.08369722 2.344366e-02 CTRP Gene Expression
## 13 dabrafenib_BRAF -0.13580181 1.042047e-02 CTRP Gene Expression
## 16 erlotinib_EGFR -0.29662051 2.250871e-16 CTRP Gene Expression
## 19 fluorouracil_DPYD 0.07901692 3.115866e-02 CTRP Gene Expression
## 22 gefitinib_EGFR -0.23625727 1.449723e-10 CTRP Gene Expression
## 25 ibrutinib_MYD88 -0.17895196 4.129365e-05 CTRP Gene Expression
## 26 imatinib_ABL1 -0.09071727 1.414392e-02 CTRP Gene Expression
## 27 imatinib_KIT -0.08338536 2.416094e-02 CTRP Gene Expression
## 29 imatinib_PDGFRA -0.18127023 8.216287e-07 CTRP Gene Expression
## 31 lapatinib_ERBB2 -0.30626953 1.265786e-16 CTRP Gene Expression
## 33 neratinib_ERBB2 -0.36529699 3.230795e-25 CTRP Gene Expression
## 34 nilotinib_ABL1 -0.12805107 5.430712e-04 CTRP Gene Expression
## 36 olaparib_BRCA1 -0.16523104 5.799957e-06 CTRP Gene Expression
## 37 olaparib_BRCA2 -0.15721340 1.606024e-05 CTRP Gene Expression
## 40 regorafenib_PDGFRA -0.17989364 1.380583e-06 CTRP Gene Expression
## 41 ruxolitinib_JAK2 -0.33827973 2.704312e-21 CTRP Gene Expression
## 44 sunitinib_PDGFRA -0.23512534 1.152301e-10 CTRP Gene Expression
## 45 temozolomide_MGMT 0.15348477 2.617716e-05 CTRP Gene Expression
# Same extraction as for gene expression, but using the copy-number Pearson
# coefficient and p-value.
ctrp_pearson_res_CN <- ctrp_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Pearson_Corr_CN, p = Pearson_Pval_CN, Dataset) %>%
  mutate(Metric = "Copy Number")

# Print the pairs with p < 0.05
subset(ctrp_pearson_res_CN, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.14970107 1.016142e-04 CTRP Copy Number
## 2 afatinib_ERBB2 -0.27815935 1.851309e-13 CTRP Copy Number
## 13 dabrafenib_BRAF -0.18614350 3.182922e-04 CTRP Copy Number
## 16 erlotinib_EGFR -0.10506205 4.094745e-03 CTRP Copy Number
## 18 erlotinib_MET 0.08711742 1.708976e-02 CTRP Copy Number
## 22 gefitinib_EGFR -0.09738942 8.461700e-03 CTRP Copy Number
## 25 ibrutinib_MYD88 0.08641841 4.613329e-02 CTRP Copy Number
## 27 imatinib_KIT -0.09316157 1.106459e-02 CTRP Copy Number
## 31 lapatinib_ERBB2 -0.28671220 7.290435e-15 CTRP Copy Number
## 33 neratinib_ERBB2 -0.32091772 8.278492e-20 CTRP Copy Number
## 41 ruxolitinib_JAK2 -0.08341880 2.197525e-02 CTRP Copy Number
## 43 sunitinib_KIT -0.08632231 1.836524e-02 CTRP Copy Number
## 44 sunitinib_PDGFRA -0.07769718 3.397524e-02 CTRP Copy Number
## 45 temozolomide_MGMT 0.08704025 1.674741e-02 CTRP Copy Number
## 48 vemurafenib_BRAF -0.11363950 2.089634e-03 CTRP Copy Number
# Stack the GE and CN Pearson results, fix the facet order (GE first), and add
# the -log10(p) column used as the y axis.
ctrp_pearson_res_filt <- rbind(ctrp_pearson_res_GE, ctrp_pearson_res_CN) %>%
  mutate(
    Metric = factor(Metric, levels = c("Gene Expression", "Copy Number")),
    p_log10 = -log10(p)
  )

# Left panel: mutation-status p-values (p < 0.05 only). The x axis carries no
# information here (corr is constant), so its title, text and ticks are hidden.
ctrp_summ_res_mut_plot <- ggplot(data = subset(ctrp_res_mut, p < 0.05)) +
  facet_grid(Dataset ~ Metric) +
  scale_y_continuous(
    breaks = seq(0, 35, by = 5),
    labels = seq(0, 35, by = 5)
  ) +
  coord_cartesian(ylim = c(0, 35)) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, label = Drug_Gene, color = Metric),
    force = 15,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.9
  ) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(5.5, 1.5, 5.5, 5.5), "pt"),
    strip.background.y = element_blank(),
    strip.text.y = element_blank()
  ) +
  scale_color_manual(values = c("Mutation Status" = "darkorchid4")) +
  labs(y = "-log10(p-value)")

# Right panels: Pearson r vs -log10(p) for GE and CN (p < 0.05 only), with a
# dashed reference line at r = 0. The y axis is hidden because the left panel
# shows it and both panels use the same 0-35 window.
ctrp_summ_res_pearson_plot <- ggplot(data = subset(ctrp_pearson_res_filt, p < 0.05)) +
  facet_grid(Dataset ~ Metric) +
  coord_cartesian(ylim = c(0, 35)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Drug_Gene),
    force = 10,
    nudge_y = 2,
    nudge_x = 0.05,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  scale_color_manual(
    values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")
  ) +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(x = "Pearson correlation coefficient (r)")

# Combine both panels in one row (relative widths 1:5) and display
ctrp_genomic_features_pearson_summ_plot <- plot_grid(
  ctrp_summ_res_mut_plot,
  ctrp_summ_res_pearson_plot,
  align = "h",
  rel_widths = c(1, 5)
)
ctrp_genomic_features_pearson_summ_plot
# ggsave("./plots/manuscript/ctrp_genomic_features_pearson_summ_plot.png", ctrp_genomic_features_pearson_summ_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
Summary plot using Spearman correlation coefficient for GE and CN:
# CTRP drug-gene pairs flagged InG2P == "Yes": Spearman coefficient and
# p-value for gene expression, renamed to the generic corr / p columns.
ctrp_spearman_res_GE <- ctrp_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Spearman_Corr_GE, p = Spearman_Pval_GE, Dataset) %>%
  mutate(Metric = "Gene Expression")

# Print the pairs with p < 0.05
subset(ctrp_spearman_res_GE, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.16399791 2.363545e-05 CTRP Gene Expression
## 2 afatinib_ERBB2 -0.22139661 8.120121e-09 CTRP Gene Expression
## 3 afatinib_ERBB3 -0.18723814 1.203512e-06 CTRP Gene Expression
## 4 afatinib_KRAS -0.08068843 3.764787e-02 CTRP Gene Expression
## 5 azd_EGFR 0.23326505 1.369638e-10 CTRP Gene Expression
## 7 belinostat_UGT1A1 0.19286426 2.098214e-04 CTRP Gene Expression
## 9 crizotinib_ALK 0.08102139 2.849311e-02 CTRP Gene Expression
## 10 crizotinib_MET 0.34533264 6.696782e-22 CTRP Gene Expression
## 11 crizotinib_ROS1 0.19181901 1.666820e-07 CTRP Gene Expression
## 13 dabrafenib_BRAF -0.16616580 1.680102e-03 CTRP Gene Expression
## 16 erlotinib_EGFR -0.19487288 1.028117e-07 CTRP Gene Expression
## 18 erlotinib_MET -0.08852238 1.615211e-02 CTRP Gene Expression
## 19 fluorouracil_DPYD 0.12817583 4.570220e-04 CTRP Gene Expression
## 22 gefitinib_EGFR -0.09299698 1.266813e-02 CTRP Gene Expression
## 28 imatinib_PDGFB 0.22705008 5.460594e-10 CTRP Gene Expression
## 29 imatinib_PDGFRA 0.08075794 2.912542e-02 CTRP Gene Expression
## 31 lapatinib_ERBB2 -0.11633156 2.081204e-03 CTRP Gene Expression
## 33 neratinib_ERBB2 -0.07441253 4.107811e-02 CTRP Gene Expression
## 36 olaparib_BRCA1 -0.13684940 1.792261e-04 CTRP Gene Expression
## 37 olaparib_BRCA2 -0.12634388 5.423590e-04 CTRP Gene Expression
## 38 pazopanib_UGT1A1 0.10137558 6.013665e-03 CTRP Gene Expression
## 41 ruxolitinib_JAK2 -0.14009143 1.302860e-04 CTRP Gene Expression
## 45 temozolomide_MGMT 0.18602351 3.218486e-07 CTRP Gene Expression
# Same extraction as for gene expression, but using the copy-number Spearman
# coefficient and p-value.
ctrp_spearman_res_CN <- ctrp_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Spearman_Corr_CN, p = Spearman_Pval_CN, Dataset) %>%
  mutate(Metric = "Copy Number")

# Print the pairs with p < 0.05
subset(ctrp_spearman_res_CN, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.10296399 7.692570e-03 CTRP Copy Number
## 2 afatinib_ERBB2 -0.09801497 1.083669e-02 CTRP Copy Number
## 13 dabrafenib_BRAF -0.10703010 3.961768e-02 CTRP Copy Number
## 17 erlotinib_KRAS -0.07696502 3.496236e-02 CTRP Copy Number
## 18 erlotinib_MET 0.17562756 1.323702e-06 CTRP Copy Number
## 19 fluorouracil_DPYD 0.09182214 1.159702e-02 CTRP Copy Number
## 25 ibrutinib_MYD88 0.15818442 2.458867e-04 CTRP Copy Number
## 31 lapatinib_ERBB2 -0.11839431 1.600547e-03 CTRP Copy Number
## 33 neratinib_ERBB2 -0.08414538 1.984871e-02 CTRP Copy Number
## 44 sunitinib_PDGFRA -0.08301361 2.345518e-02 CTRP Copy Number
## 45 temozolomide_MGMT 0.08633604 1.765481e-02 CTRP Copy Number
# Stack the GE and CN Spearman results, fix the facet order (GE first), and
# add the -log10(p) column used as the y axis.
ctrp_spearman_res_filt <- rbind(ctrp_spearman_res_GE, ctrp_spearman_res_CN) %>%
  mutate(
    Metric = factor(Metric, levels = c("Gene Expression", "Copy Number")),
    p_log10 = -log10(p)
  )

# Spearman r_s vs -log10(p) for GE and CN (p < 0.05 only), with a dashed
# reference line at r_s = 0; y axis hidden and limited to the 0-35 window.
ctrp_summ_res_spearman_plot <- ggplot(data = subset(ctrp_spearman_res_filt, p < 0.05)) +
  facet_grid(Dataset ~ Metric) +
  coord_cartesian(ylim = c(0, 35)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Drug_Gene),
    force = 20,
    nudge_y = 2,
    nudge_x = 0.05,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  scale_color_manual(
    values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")
  ) +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(x = "Spearman's rank correlation coefficient (r_s)")

# Pair the CTRP mutation-status panel with the Spearman panels (widths 1:5)
ctrp_genomic_features_spearman_summ_plot <- plot_grid(
  ctrp_summ_res_mut_plot,
  ctrp_summ_res_spearman_plot,
  align = "h",
  rel_widths = c(1, 5)
)
ctrp_genomic_features_spearman_summ_plot
# ggsave("./plots/manuscript/ctrp_genomic_features_spearman_summ_plot.png", ctrp_genomic_features_spearman_summ_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
GDSC: G2P drug-gene associations with a significant effect on AUC by mutation status, gene expression, and copy number. Summary plot using the Pearson correlation coefficient for GE and CN:
# GDSC mutation-status results. corr is set to a constant 0; it is only used
# as the x position of these points in the summary plots below.
gdsc_res_mut <- gdsc_signif_g2p %>%
  select(Drug_Gene, p) %>%
  mutate(
    corr = 0,
    Metric = "Mutation Status",
    p_log10 = -log10(p),
    Dataset = "GDSC"
  )

# GDSC drug-gene pairs flagged InG2P == "Yes": Pearson coefficient and p-value
# for gene expression, renamed to the generic corr / p columns.
gdsc_pearson_res_GE <- gdsc_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Pearson_Corr_GE, p = Pearson_Pval_GE, Dataset) %>%
  mutate(Metric = "Gene Expression")

# Print the pairs with p < 0.05
subset(gdsc_pearson_res_GE, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.33303159 8.652492e-16 GDSC Gene Expression
## 2 afatinib_ERBB2 -0.43254689 1.280805e-26 GDSC Gene Expression
## 3 afatinib_ERBB3 -0.18092508 1.865808e-05 GDSC Gene Expression
## 5 alectinib_ALK -0.11263501 5.314398e-03 GDSC Gene Expression
## 10 crizotinib_ALK -0.42535745 1.305363e-11 GDSC Gene Expression
## 11 crizotinib_MET -0.35204123 3.576379e-08 GDSC Gene Expression
## 15 dasatinib_ABL1 -0.22649720 6.190458e-04 GDSC Gene Expression
## 17 erlotinib_EGFR -0.29450714 1.904776e-05 GDSC Gene Expression
## 19 erlotinib_MET -0.24941466 3.213084e-04 GDSC Gene Expression
## 20 gefitinib_EGFR -0.40872642 1.605816e-23 GDSC Gene Expression
## 23 imatinib_ABL1 -0.31147922 1.233678e-06 GDSC Gene Expression
## 28 lapatinib_ERBB2 -0.41762769 8.801358e-11 GDSC Gene Expression
## 30 midostaurin_FLT3 -0.19161085 2.707673e-06 GDSC Gene Expression
## 31 nilotinib_ABL1 -0.24528936 1.201229e-08 GDSC Gene Expression
## 33 palbociclib_ERBB2 0.11564483 7.415054e-03 GDSC Gene Expression
## 35 ponatinib_ABL1 -0.12143722 3.082291e-03 GDSC Gene Expression
## 36 rucaparib_BRCA1 0.08921523 2.808740e-02 GDSC Gene Expression
## 38 ruxolitinib_JAK2 -0.62181562 7.208779e-67 GDSC Gene Expression
## 41 temozolomide_MGMT 0.18325912 6.359042e-06 GDSC Gene Expression
## 42 trametinib_BRAF 0.23318770 1.291202e-08 GDSC Gene Expression
# Same extraction as for gene expression, but using the copy-number Pearson
# coefficient and p-value.
gdsc_pearson_res_CN <- gdsc_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Pearson_Corr_CN, p = Pearson_Pval_CN, Dataset) %>%
  mutate(Metric = "Copy Number")

# Print the pairs with p < 0.05
subset(gdsc_pearson_res_CN, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.2350772 1.860050e-08 GDSC Copy Number
## 2 afatinib_ERBB2 -0.4060245 1.339596e-23 GDSC Copy Number
## 10 crizotinib_ALK -0.1530971 1.786548e-02 GDSC Copy Number
## 11 crizotinib_MET -0.1775654 5.912230e-03 GDSC Copy Number
## 13 dabrafenib_BRAF -0.1737545 2.797815e-05 GDSC Copy Number
## 20 gefitinib_EGFR -0.2550968 1.078784e-09 GDSC Copy Number
## 23 imatinib_ABL1 0.1843771 4.156196e-03 GDSC Copy Number
## 26 imatinib_PDGFRA -0.1379720 3.263682e-02 GDSC Copy Number
## 28 lapatinib_ERBB2 -0.3205038 7.237051e-07 GDSC Copy Number
## 31 nilotinib_ABL1 0.1860943 1.476229e-05 GDSC Copy Number
## 35 ponatinib_ABL1 0.1207735 3.071039e-03 GDSC Copy Number
## 38 ruxolitinib_JAK2 -0.2314618 5.664756e-09 GDSC Copy Number
## 39 sunitinib_KIT -0.1387294 3.391719e-02 GDSC Copy Number
# Stack the GE and CN Pearson results, fix the facet order (GE first), and add
# the -log10(p) column used as the y axis.
gdsc_pearson_res_filt <- rbind(gdsc_pearson_res_GE, gdsc_pearson_res_CN) %>%
  mutate(
    Metric = factor(Metric, levels = c("Gene Expression", "Copy Number")),
    p_log10 = -log10(p)
  )

# Left panel: mutation-status p-values (p < 0.05 only). No y window is forced
# here, so this panel keeps its own y range and unit-spaced breaks.
gdsc_summ_res_mut_plot <- ggplot(data = subset(gdsc_res_mut, p < 0.05)) +
  facet_grid(Dataset ~ Metric) +
  scale_y_continuous(
    breaks = seq(0, 16, by = 1),
    labels = seq(0, 16, by = 1)
  ) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, label = Drug_Gene, color = Metric),
    force = 15,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.9
  ) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(5.5, 1.5, 5.5, 5.5), "pt"),
    strip.background.y = element_blank(),
    strip.text.y = element_blank()
  ) +
  scale_color_manual(values = c("Mutation Status" = "darkorchid4")) +
  labs(y = "-log10(p-value)")

# Right panels: Pearson r vs -log10(p) for GE and CN (p < 0.05 only). The y
# axis text is kept visible here (only its title is removed) because these
# panels do not share the left panel's y breaks.
gdsc_summ_res_pearson_plot <- ggplot(data = subset(gdsc_pearson_res_filt, p < 0.05)) +
  facet_grid(Dataset ~ Metric) +
  scale_y_continuous(
    breaks = seq(0, 100, by = 10),
    labels = seq(0, 100, by = 10)
  ) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Drug_Gene),
    force = 10,
    nudge_y = 2,
    nudge_x = 0.05,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  scale_color_manual(
    values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")
  ) +
  theme(
    legend.position = "none",
    axis.title.y = element_blank(),
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(x = "Pearson correlation coefficient (r)")

# Combine both panels in one row (relative widths 1:5) and display
gdsc_genomic_features_pearson_summ_plot <- plot_grid(
  gdsc_summ_res_mut_plot,
  gdsc_summ_res_pearson_plot,
  align = "h",
  rel_widths = c(1, 5)
)
gdsc_genomic_features_pearson_summ_plot
# ggsave("./plots/manuscript/gdsc_genomic_features_pearson_summ_plot.png", gdsc_genomic_features_pearson_summ_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
Summary plot using Spearman correlation coefficient for GE and CN:
# GDSC drug-gene pairs flagged InG2P == "Yes": Spearman coefficient and
# p-value for gene expression, renamed to the generic corr / p columns.
gdsc_spearman_res_GE <- gdsc_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Spearman_Corr_GE, p = Spearman_Pval_GE, Dataset) %>%
  mutate(Metric = "Gene Expression")

# Print the pairs with p < 0.05
subset(gdsc_spearman_res_GE, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.44291125 5.680335e-28 GDSC Gene Expression
## 2 afatinib_ERBB2 -0.53133832 0.000000e+00 GDSC Gene Expression
## 3 afatinib_ERBB3 -0.37012268 2.139226e-19 GDSC Gene Expression
## 4 afatinib_KRAS 0.17418472 3.857124e-05 GDSC Gene Expression
## 7 belinostat_UGT1A1 0.10072735 1.413007e-02 GDSC Gene Expression
## 9 cabozantinib_RET 0.08995695 2.569136e-02 GDSC Gene Expression
## 15 dasatinib_ABL1 -0.15537716 1.979070e-02 GDSC Gene Expression
## 16 docetaxel_ERBB2 -0.19978623 2.248355e-06 GDSC Gene Expression
## 17 erlotinib_EGFR -0.31236839 5.404734e-06 GDSC Gene Expression
## 18 erlotinib_KRAS 0.14753730 3.529927e-02 GDSC Gene Expression
## 19 erlotinib_MET -0.23232149 8.512432e-04 GDSC Gene Expression
## 20 gefitinib_EGFR -0.38517060 7.382443e-21 GDSC Gene Expression
## 21 gefitinib_KRAS 0.14741333 5.243768e-04 GDSC Gene Expression
## 22 gefitinib_MET -0.28525040 1.206462e-11 GDSC Gene Expression
## 23 imatinib_ABL1 -0.22175009 6.699703e-04 GDSC Gene Expression
## 25 imatinib_PDGFB -0.20427018 1.762465e-03 GDSC Gene Expression
## 28 lapatinib_ERBB2 -0.48764198 0.000000e+00 GDSC Gene Expression
## 29 lapatinib_KRAS 0.20062452 2.717575e-03 GDSC Gene Expression
## 36 rucaparib_BRCA1 0.18307976 5.951358e-06 GDSC Gene Expression
## 37 rucaparib_BRCA2 0.13932807 5.830335e-04 GDSC Gene Expression
## 39 sunitinib_KIT 0.16595488 1.228167e-02 GDSC Gene Expression
## 41 temozolomide_MGMT 0.18463442 5.401557e-06 GDSC Gene Expression
## 42 trametinib_BRAF 0.26297587 1.204429e-10 GDSC Gene Expression
# Same extraction as for gene expression, but using the copy-number Spearman
# coefficient and p-value.
gdsc_spearman_res_CN <- gdsc_cor_res %>%
  filter(InG2P == "Yes") %>%
  select(Drug_Gene, corr = Spearman_Corr_CN, p = Spearman_Pval_CN, Dataset) %>%
  mutate(Metric = "Copy Number")

# Print the pairs with p < 0.05
subset(gdsc_spearman_res_CN, p < 0.05)
## Drug_Gene corr p Dataset Metric
## 1 afatinib_EGFR -0.2190284 1.689794e-07 GDSC Copy Number
## 2 afatinib_ERBB2 -0.2245173 8.092089e-08 GDSC Copy Number
## 20 gefitinib_EGFR -0.1659511 8.565399e-05 GDSC Copy Number
## 27 imatinib_PDGFRB -0.1607070 1.267085e-02 GDSC Copy Number
## 35 ponatinib_ABL1 0.1010486 1.335042e-02 GDSC Copy Number
# Stack the GE and CN Spearman results, fix the facet order (GE first), and
# add the -log10(p) column used as the y axis.
gdsc_spearman_res_filt <- rbind(gdsc_spearman_res_GE, gdsc_spearman_res_CN)
gdsc_spearman_res_filt$Metric <- factor(gdsc_spearman_res_filt$Metric, levels = c("Gene Expression", "Copy Number"))
gdsc_spearman_res_filt$p_log10 <- -log10(gdsc_spearman_res_filt$p)

# Spearman r_s vs -log10(p) for GE and CN (p < 0.05 only), viewed in a 0-30
# y window. Two GE pairs have a reported p of exactly 0 (so -log10(p) is Inf),
# which is what the text note refers to.
#
# The note is drawn with geom_text() on a one-row data frame that carries the
# faceting variables, so it appears only in the GDSC / Gene Expression panel.
# The previous annotate(label = c("...", "")) call could not target a panel:
# annotation layers have no facet variables and are repeated in every panel,
# so the note was also drawn over the Copy Number panel.
gdsc_summ_res_spearman_plot <- ggplot(data = subset(gdsc_spearman_res_filt, p < 0.05)) +
  facet_grid(Dataset ~ Metric) +
  coord_cartesian(ylim = c(0, 30)) +
  scale_y_continuous(breaks = seq(0, 50, by = 5), labels = seq(0, 50, by = 5)) +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Metric), alpha = 0.5) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Metric, label = Drug_Gene),
    force = 20, nudge_y = 2, nudge_x = 0.05, size = 2.5,
    segment.size = 0.2, segment.colour = "grey50", box.padding = 0.6
  ) +
  scale_color_manual(values = c("Gene Expression" = "darkorchid1", "Copy Number" = "mediumorchid1")) +
  theme(legend.position = "none", axis.title.y = element_blank(), plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")) +
  geom_text(
    data = data.frame(
      Dataset = "GDSC",
      Metric = factor("Gene Expression", levels = c("Gene Expression", "Copy Number")),
      corr = 0,
      p_log10 = 30,
      note = "afatinib_ERBB2 and lapatinib_ERBB2\n-log10(p-value) >> 100",
      stringsAsFactors = FALSE
    ),
    mapping = aes(x = corr, y = p_log10, label = note),
    size = 3,
    inherit.aes = FALSE
  ) +
  labs(x = "Spearman's rank correlation coefficient (r_s)")

# Pair the GDSC mutation-status panel with the Spearman panels (widths 1:5)
gdsc_genomic_features_spearman_summ_plot <- plot_grid(gdsc_summ_res_mut_plot, gdsc_summ_res_spearman_plot, align = "h", rel_widths = c(1, 5))
gdsc_genomic_features_spearman_summ_plot
# ggsave("./plots/manuscript/gdsc_genomic_features_spearman_summ_plot.png", gdsc_genomic_features_spearman_summ_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
G2P drug-gene associations with a significant effect on AUC by mutation status:
# Pool the mutation-status results of the three drug screens.
all_mut_res <- rbind(ccle_res_mut, ctrp_res_mut, gdsc_res_mut)

# One facet per dataset, p < 0.05 only. The x axis is decorative (its title,
# text, ticks and vertical grid lines are all hidden); labels are nudged to
# the right of the points and repelled along y only.
all_mut_summ_plot <- ggplot(data = subset(all_mut_res, p < 0.05)) +
  facet_grid(~ Dataset) +
  geom_point(aes(x = corr, y = p_log10, color = Dataset), alpha = 0.5) +
  scale_y_continuous(
    breaks = seq(0, 12, by = 2),
    labels = seq(0, 12, by = 2)
  ) +
  scale_x_continuous(
    breaks = seq(0, 0.1, by = 0.1),
    labels = seq(0, 0.1, by = 0.1)
  ) +
  coord_cartesian(xlim = c(0, 0.05)) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Dataset, label = Drug_Gene),
    nudge_x = 0.05,
    direction = "y",
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  scale_color_manual(
    values = c("CCLE" = "springgreen4", "CTRP" = "steelblue4", "GDSC" = "turquoise4")
  ) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  labs(y = "-log10(p-value)")
all_mut_summ_plot
# ggsave("./plots/manuscript/all_mut_summ_plot.png", all_mut_summ_plot, device = "png", dpi = 450, width = 12, height = 6, units = "in")
G2P drug-gene associations with a significant effect on AUC by gene expression and copy number:
# Pool the Pearson GE/CN results of the three drug screens.
all_pearson_res <- rbind(ccle_pearson_res_filt, ctrp_pearson_res_filt, gdsc_pearson_res_filt)

# Dataset x Metric grid with free scales and free panel sizes, p < 0.05 only,
# and a dashed reference line at r = 0.
all_pearson_summ_plot <- ggplot(data = subset(all_pearson_res, p < 0.05)) +
  facet_grid(Dataset ~ Metric, scales = "free", space = "free") +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Dataset), alpha = 0.5) +
  scale_y_continuous(
    breaks = seq(0, 60, by = 10),
    labels = seq(0, 60, by = 10)
  ) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Dataset, label = Drug_Gene),
    force = 10,
    nudge_y = 3,
    size = 2.5,
    segment.size = 0.2,
    segment.colour = "grey50",
    box.padding = 0.6
  ) +
  scale_color_manual(
    values = c("CCLE" = "springgreen4", "CTRP" = "steelblue4", "GDSC" = "turquoise4")
  ) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")
  ) +
  labs(
    x = "Pearson correlation coefficient (r)",
    y = "-log10(p-value)"
  )
all_pearson_summ_plot
# ggsave("./plots/manuscript/all_pearson_summ_plot.png", all_pearson_summ_plot, device = "png", dpi = 450, width = 12, height = 10, units = "in")
# Pool the Spearman GE/CN results of the three drug screens.
all_spearman_res <- rbind(ccle_spearman_res_filt, ctrp_spearman_res_filt, gdsc_spearman_res_filt)

# Dataset x Metric grid with free scales and free panel sizes, p < 0.05 only,
# and a dashed reference line at r_s = 0.
#
# Two fixes relative to the earlier version of this plot:
#  * The x axis is labelled as Spearman's r_s (it previously carried the
#    Pearson label copied from the Pearson plot).
#  * The off-scale note is drawn with geom_text() on a one-row data frame that
#    carries the faceting variables, so it appears only in the GDSC / Gene
#    Expression panel. The previous annotate() call padded the label vector
#    with "" to target the fifth panel, but annotation layers have no facet
#    variables and are repeated in every panel, so the note was drawn in all
#    six panels and y = 30 was forced into every free y scale.
all_spearman_summ_plot <- ggplot(data = subset(all_spearman_res, p < 0.05)) +
  facet_grid(Dataset ~ Metric, scales = "free", space = "free") +
  geom_vline(xintercept = 0, color = "grey50", lty = 2, lwd = 0.3) +
  geom_point(aes(x = corr, y = p_log10, color = Dataset), alpha = 0.5) +
  scale_y_continuous(breaks = seq(0, 60, by = 10), labels = seq(0, 60, by = 10)) +
  geom_text_repel(
    mapping = aes(y = p_log10, x = corr, color = Dataset, label = Drug_Gene),
    force = 10, nudge_y = 1, size = 2.5,
    segment.size = 0.2, segment.colour = "grey50", box.padding = 0.6
  ) +
  scale_color_manual(values = c("CCLE" = "springgreen4", "CTRP" = "steelblue4", "GDSC" = "turquoise4")) +
  theme(legend.position = "none", plot.margin = unit(c(5.5, 5.5, 5.5, 1.5), "pt")) +
  geom_text(
    data = data.frame(
      Dataset = "GDSC",
      Metric = factor("Gene Expression", levels = c("Gene Expression", "Copy Number")),
      corr = 0,
      p_log10 = 30,
      note = "afatinib_ERBB2 and lapatinib_ERBB2\n-log10(p-value) >> 100",
      stringsAsFactors = FALSE
    ),
    mapping = aes(x = corr, y = p_log10, label = note),
    size = 3,
    inherit.aes = FALSE
  ) +
  labs(x = "Spearman's rank correlation coefficient (r_s)", y = "-log10(p-value)")
all_spearman_summ_plot
# ggsave("./plots/manuscript/all_spearman_summ_plot.png", all_spearman_summ_plot, device = "png", dpi = 450, width = 12, height = 10, units = "in")
Venn diagram of drug-gene associations:
# Only let ERROR-level messages through on the "VennDiagramLogger" logger.
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
## NULL

# Drug-gene pairs per dataset for each genomic feature:
#  * mutation status: rows whose p.signif.adj flag is set (non-NA);
#  * gene expression / copy number: rows with Spearman p < 0.05.
# The p-value filters go through which() so that rows with a missing p-value
# are dropped. Indexing rows directly with a logical vector containing NA
# yields NA rows, which would put an NA "pair" into each set and be counted by
# venn.diagram() as an element shared between datasets.
ccle_mut_dgs <- as.character(ccle_signif_g2p_grid$Drug_Gene[!is.na(ccle_signif_g2p_grid$p.signif.adj)])
ctrp_mut_dgs <- as.character(ctrp_signif_g2p_grid$Drug_Gene[!is.na(ctrp_signif_g2p_grid$p.signif.adj)])
gdsc_mut_dgs <- as.character(gdsc_signif_g2p_grid$Drug_Gene[!is.na(gdsc_signif_g2p_grid$p.signif.adj)])
ccle_ge_dgs <- as.character(ccle_cor_res$Drug_Gene[which(ccle_cor_res$Spearman_Pval_GE < 0.05)])
ctrp_ge_dgs <- as.character(ctrp_cor_res$Drug_Gene[which(ctrp_cor_res$Spearman_Pval_GE < 0.05)])
gdsc_ge_dgs <- as.character(gdsc_cor_res$Drug_Gene[which(gdsc_cor_res$Spearman_Pval_GE < 0.05)])
ccle_cn_dgs <- as.character(ccle_cor_res$Drug_Gene[which(ccle_cor_res$Spearman_Pval_CN < 0.05)])
ctrp_cn_dgs <- as.character(ctrp_cor_res$Drug_Gene[which(ctrp_cor_res$Spearman_Pval_CN < 0.05)])
gdsc_cn_dgs <- as.character(gdsc_cor_res$Drug_Gene[which(gdsc_cor_res$Spearman_Pval_CN < 0.05)])
# Panel A: overlap of mutation-status drug-gene pairs between the datasets.
# filename = NULL makes venn.diagram() return the grobs instead of writing a file.
drug_venn_mut <- venn.diagram(
  list("CCLE" = ccle_mut_dgs, "CTRP" = ctrp_mut_dgs, "GDSC" = gdsc_mut_dgs),
  resolution = 5000,
  main = "A", main.pos = c(0, 1), main.fontface = "bold",
  lty = rep("blank", 3),
  fill = c("springgreen4", "steelblue4", "turquoise4"),
  alpha = c(0.5, 0.5, 0.5),
  cat.pos = c(-30, 50, 120), cat.dist = c(0.001, 0.02, 0.0001),
  cat.cex = 1.5, cex = 1,
  filename = NULL, height = 4000
)

# Replace the region counts with the names of the pairs in each region.
# NOTE(review): the positions 7-10 are hard-coded; they presumably match the
# region-label grobs of the layout drawn for the current data. Re-check them
# if the set memberships change.
# Every "only" region excludes the remaining dataset, and every label is
# collapsed into one newline-separated string. An uncollapsed character vector
# would be drawn as several labels on top of each other.
# in GDSC only
drug_venn_mut[[7]]$label <- paste(sort(setdiff(setdiff(gdsc_mut_dgs, ctrp_mut_dgs), ccle_mut_dgs)), collapse = "\n")
# in GDSC and CTRP only
drug_venn_mut[[8]]$label <- paste(setdiff(intersect(gdsc_mut_dgs, ctrp_mut_dgs), ccle_mut_dgs), collapse = "\n")
# in CTRP only
drug_venn_mut[[9]]$label <- paste(sort(setdiff(setdiff(ctrp_mut_dgs, gdsc_mut_dgs), ccle_mut_dgs)), collapse = "\n")
# in CTRP and CCLE only
drug_venn_mut[[10]]$label <- paste(setdiff(intersect(ccle_mut_dgs, ctrp_mut_dgs), gdsc_mut_dgs), collapse = "\n")

# Panel B: overlap of gene-expression drug-gene pairs (counts kept as labels).
drug_venn_ge <- venn.diagram(
  list("CCLE" = ccle_ge_dgs, "CTRP" = ctrp_ge_dgs, "GDSC" = gdsc_ge_dgs),
  resolution = 5000,
  main = "B", main.pos = c(0, 1), main.fontface = "bold",
  lty = rep("blank", 3),
  fill = c("springgreen4", "steelblue4", "turquoise4"),
  alpha = c(0.5, 0.5, 0.5),
  cat.pos = c(-30, 30, 180), cat.dist = c(0.001, 0.001, 0.001),
  cat.cex = 1.5, cex = 1,
  filename = NULL
)

# Panel C: overlap of copy-number drug-gene pairs (counts kept as labels).
drug_venn_cn <- venn.diagram(
  list("CCLE" = ccle_cn_dgs, "CTRP" = ctrp_cn_dgs, "GDSC" = gdsc_cn_dgs),
  resolution = 5000,
  main = "C", main.pos = c(0, 1), main.fontface = "bold",
  lty = rep("blank", 3),
  fill = c("springgreen4", "steelblue4", "turquoise4"),
  alpha = c(0.5, 0.5, 0.5),
  cat.pos = c(0, 0, 0), cat.dist = c(0.11, 0.11, 0.001),
  cat.cex = 1.5, cex = 1,
  filename = NULL
)
# Stack the three Venn diagrams in a single column; panels A and B each take
# 2/5 of the height and panel C the remaining 1/5. The layout is built once
# and kept in drugscreen_venns so it can also be saved.
drugscreen_venns <- arrangeGrob(
  grobTree(drug_venn_mut),
  grobTree(drug_venn_ge),
  grobTree(drug_venn_cn),
  ncol = 1,
  heights = c(2/5, 2/5, 1/5)
)

# Draw the stacked layout on a fresh page
grid::grid.newpage()
grid::grid.draw(drugscreen_venns)
# ggsave(file = "./plots/manuscript/drugscreen_venns.pdf", drugscreen_venns) #saves g
# png("./plots/manuscript/drugscreen_venn_mut.png")
# grid.newpage()
# grid.draw(drug_venn_mut)
# dev.off()
Mutation summaries for G2P grid comparisons:
# Per dataset: full outer join of the G2P drug-gene tally with the
# mutation-status p-values. Pairs missing from the tally get n = 0 and are
# flagged InG2P = "No"; all others are flagged "Yes".
ccle_mut_summ <- merge(
  g2p_druggene_tally,
  ccle_signif_g2p_grid[, c("Drug_Gene", "p")],
  by = "Drug_Gene",
  all = TRUE
) %>%
  mutate(
    n = ifelse(is.na(n), 0, n),
    InG2P = ifelse(n != 0, "Yes", "No"),
    Dataset = "CCLE"
  )

ctrp_mut_summ <- merge(
  g2p_druggene_tally,
  ctrp_signif_g2p_grid[, c("Drug_Gene", "p")],
  by = "Drug_Gene",
  all = TRUE
) %>%
  mutate(
    n = ifelse(is.na(n), 0, n),
    InG2P = ifelse(n != 0, "Yes", "No"),
    Dataset = "CTRP"
  )

gdsc_mut_summ <- merge(
  g2p_druggene_tally,
  gdsc_signif_g2p_grid[, c("Drug_Gene", "p")],
  by = "Drug_Gene",
  all = TRUE
) %>%
  mutate(
    n = ifelse(is.na(n), 0, n),
    InG2P = ifelse(n != 0, "Yes", "No"),
    Dataset = "GDSC"
  )

# Pool the three datasets and add -log10(p)
g2p_mut_all_summ <- rbind(ccle_mut_summ, ctrp_mut_summ, gdsc_mut_summ) %>%
  mutate(p_log10 = -log10(p))

# Mutation-status slice of the combined p-value table; the helper column
# Drug_Gene_Ordered is dropped if it is present.
pval_summ_mut <- g2p_mut_all_summ %>%
  mutate(Measure = "Mutation Status")
pval_summ_mut[["Drug_Gene_Ordered"]] <- NULL
# Gene-expression slice: Pearson p-value renamed to the generic p column,
# plus its -log10 transform.
pval_summ_ge <- g2p_cor %>%
  select(Drug_Gene, n, p = Pearson_Pval_GE, InG2P, Dataset) %>%
  mutate(
    p_log10 = -log10(p),
    Measure = "Gene Expression"
  )

# Copy-number slice, built the same way
pval_summ_cn <- g2p_cor %>%
  select(Drug_Gene, n, p = Pearson_Pval_CN, InG2P, Dataset) %>%
  mutate(
    p_log10 = -log10(p),
    Measure = "Copy Number"
  )

# Long table over all three measures; the factor levels fix the facet order
pval_summ <- rbind(pval_summ_mut, pval_summ_ge, pval_summ_cn)
pval_summ$Measure <- factor(
  pval_summ$Measure,
  levels = c("Mutation Status", "Gene Expression", "Copy Number")
)
# -log10(p) against the number of G2P associations (n), one facet per measure
# with a free x axis. Within each facet, pairs with n > 20 are labelled; the
# labels are nudged right by a facet-specific amount and repelled along y.
pval_summ_plot <- ggplot(
  data = pval_summ,
  aes(x = p_log10, y = n, color = Dataset, shape = Dataset, label = Drug_Gene)
) +
  facet_wrap(~ Measure, scales = "free_x") +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("green3", "darkslategray4", "darkturquoise")) +
  geom_text_repel(
    data = filter(pval_summ, n > 20 & Measure == "Mutation Status"),
    mapping = aes(x = p_log10, y = n, color = Dataset, label = Drug_Gene),
    direction = "y",
    nudge_x = 9,
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6,
    show.legend = FALSE
  ) +
  geom_text_repel(
    data = filter(pval_summ, n > 20 & Measure == "Gene Expression"),
    mapping = aes(x = p_log10, y = n, color = Dataset, label = Drug_Gene),
    direction = "y",
    nudge_x = 50,
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6,
    show.legend = FALSE
  ) +
  geom_text_repel(
    data = filter(pval_summ, n > 20 & Measure == "Copy Number"),
    mapping = aes(x = p_log10, y = n, color = Dataset, label = Drug_Gene),
    direction = "y",
    nudge_x = 20,
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6,
    show.legend = FALSE
  ) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  labs(
    x = "-log10(p-value)",
    y = "Number of level A G2P associations"
  ) +
  theme(legend.position = "bottom")
pval_summ_plot
# ggsave("./plots/manuscript/all_pval_summ_plot.png", pval_summ_plot, device = "png", dpi = 450, width = 12, height = 6, units = "in")
# Same scatter as pval_summ_plot, but the labels are chosen by significance
# instead of by n: -log10(p) above 5 (mutation status), 20 (gene expression)
# or 7.5 (copy number). Labels are nudged upwards and repelled along y.
pval_summ_plot2 <- ggplot(
  data = pval_summ,
  aes(x = p_log10, y = n, color = Dataset, shape = Dataset, label = Drug_Gene)
) +
  facet_wrap(~ Measure, scales = "free_x") +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("green3", "darkslategray4", "darkturquoise")) +
  geom_text_repel(
    data = filter(pval_summ, p_log10 > 5 & Measure == "Mutation Status"),
    mapping = aes(x = p_log10, y = n, color = Dataset, label = Drug_Gene),
    direction = "y",
    nudge_y = 10,
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6,
    show.legend = FALSE
  ) +
  geom_text_repel(
    data = filter(pval_summ, p_log10 > 20 & Measure == "Gene Expression"),
    mapping = aes(x = p_log10, y = n, color = Dataset, label = Drug_Gene),
    direction = "y",
    nudge_y = 10,
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6,
    show.legend = FALSE
  ) +
  geom_text_repel(
    data = filter(pval_summ, p_log10 > 7.5 & Measure == "Copy Number"),
    mapping = aes(x = p_log10, y = n, color = Dataset, label = Drug_Gene),
    direction = "y",
    nudge_y = 10,
    hjust = 0,
    size = 2.5,
    segment.size = 0.1,
    segment.colour = "grey50",
    box.padding = 0.6,
    show.legend = FALSE
  ) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  labs(
    x = "-log10(p-value)",
    y = "Number of level A G2P associations"
  ) +
  theme(legend.position = "bottom")
pval_summ_plot2
# ggsave("./plots/manuscript/all_pval_summ_plot2.png", pval_summ_plot2, device = "png", dpi = 450, width = 12, height = 6, units = "in")
Compare significant Pearson and Spearman correlations:
# Gene expression: Pearson and Spearman p-values side by side, with their
# -log10 transforms.
corr_summ_ge <- g2p_cor %>%
  select(Drug_Gene, n, Pearson_Pval_GE, Spearman_Pval_GE, InG2P, Dataset) %>%
  mutate(
    Pearson_Pval_GE_log10 = -log10(Pearson_Pval_GE),
    Spearman_Pval_GE_log10 = -log10(Spearman_Pval_GE)
  )

# Which method(s) call each pair significant at p < 0.05: "Both", "Pearson",
# "Spearman", or NA for neither. ifelse() is used on purpose: a missing
# p-value makes the comparison NA and that NA propagates into the result.
corr_summ_ge$Overlap_Status <- with(corr_summ_ge, {
  ifelse(
    Pearson_Pval_GE < 0.05 & Spearman_Pval_GE < 0.05, "Both",
    ifelse(
      Pearson_Pval_GE < 0.05, "Pearson",
      ifelse(Spearman_Pval_GE < 0.05, "Spearman", NA)
    )
  )
})

# Same classification at the Bonferroni threshold 0.05 / (number of rows)
corr_summ_ge$Overlap_Status_Bonf <- with(corr_summ_ge, {
  bonf_alpha <- 0.05 / nrow(corr_summ_ge)
  ifelse(
    Pearson_Pval_GE < bonf_alpha & Spearman_Pval_GE < bonf_alpha, "Both",
    ifelse(
      Pearson_Pval_GE < bonf_alpha, "Pearson",
      ifelse(Spearman_Pval_GE < bonf_alpha, "Spearman", NA)
    )
  )
})

# Tally of pairs per category at the nominal threshold
count(corr_summ_ge, Overlap_Status)
## # A tibble: 4 x 2
## Overlap_Status nn
## <chr> <int>
## 1 Both 432
## 2 Pearson 162
## 3 Spearman 346
## 4 <NA> 998
# Tally of pairs per category at the Bonferroni threshold
count(corr_summ_ge, Overlap_Status_Bonf)
## # A tibble: 4 x 2
## Overlap_Status_Bonf nn
## <chr> <int>
## 1 Both 79
## 2 Pearson 48
## 3 Spearman 138
## 4 <NA> 1673
# Copy number: Pearson and Spearman p-values side by side, with their -log10
# transforms.
corr_summ_cn <- g2p_cor %>%
  select(Drug_Gene, n, Pearson_Pval_CN, Spearman_Pval_CN, InG2P, Dataset) %>%
  mutate(
    Pearson_Pval_CN_log10 = -log10(Pearson_Pval_CN),
    Spearman_Pval_CN_log10 = -log10(Spearman_Pval_CN)
  )

# Which method(s) call each pair significant at p < 0.05: "Both", "Pearson",
# "Spearman", or NA for neither. ifelse() is used on purpose: a missing
# p-value makes the comparison NA and that NA propagates into the result.
corr_summ_cn$Overlap_Status <- with(corr_summ_cn, {
  ifelse(
    Pearson_Pval_CN < 0.05 & Spearman_Pval_CN < 0.05, "Both",
    ifelse(
      Pearson_Pval_CN < 0.05, "Pearson",
      ifelse(Spearman_Pval_CN < 0.05, "Spearman", NA)
    )
  )
})

# Same classification at the Bonferroni threshold 0.05 / (number of rows)
corr_summ_cn$Overlap_Status_Bonf <- with(corr_summ_cn, {
  bonf_alpha <- 0.05 / nrow(corr_summ_cn)
  ifelse(
    Pearson_Pval_CN < bonf_alpha & Spearman_Pval_CN < bonf_alpha, "Both",
    ifelse(
      Pearson_Pval_CN < bonf_alpha, "Pearson",
      ifelse(Spearman_Pval_CN < bonf_alpha, "Spearman", NA)
    )
  )
})

# Tally of pairs per category at the nominal threshold
count(corr_summ_cn, Overlap_Status)
## # A tibble: 4 x 2
## Overlap_Status nn
## <chr> <int>
## 1 Both 227
## 2 Pearson 126
## 3 Spearman 127
## 4 <NA> 1458
# Tally of pairs per category at the Bonferroni threshold
count(corr_summ_cn, Overlap_Status_Bonf)
## # A tibble: 4 x 2
## Overlap_Status_Bonf nn
## <chr> <int>
## 1 Both 6
## 2 Pearson 16
## 3 Spearman 16
## 4 <NA> 1900
# Mutation status
# Per dataset and G2P membership: mean and SD of the mutation-association
# p-values, the number of pairs with a p-value, and how many (and what
# percentage) fall below 0.05. Rows without a p-value are dropped first.
g2p_mut_all_summ_stats <- g2p_mut_all_summ %>%
  drop_na(p) %>%
  group_by(Dataset, InG2P) %>%
  summarize(
    p_mean = round(mean(p), 2),
    p_sd = round(sd(p), 2),
    n = length(p),
    n_signif = sum(p < 0.05),
    n_sig_perc = n_signif / n * 100
  )
g2p_mut_all_summ_stats
## # A tibble: 6 x 7
## # Groups: Dataset [?]
## Dataset InG2P p_mean p_sd n n_signif n_sig_perc
## <chr> <chr> <dbl> <dbl> <int> <int> <dbl>
## 1 CCLE No 0.51 0.3 161 13 8.07
## 2 CCLE Yes 0.46 0.36 9 1 11.1
## 3 CTRP No 0.46 0.3 870 81 9.31
## 4 CTRP Yes 0.4 0.33 48 12 25
## 5 GDSC No 0.48 0.3 806 68 8.44
## 6 GDSC Yes 0.52 0.33 44 4 9.09
# Spearman correlations
# Per dataset and G2P membership: mean/SD of the absolute Spearman
# coefficients and the share of pairs with p < 0.05, for gene expression (ge_*)
# and copy number (cn_*). Rows with an NA in any column are dropped, so the
# GE and CN summaries are computed on the same rows.
# NOTE(review): cn_mean is rounded to 3 digits while ge_mean, ge_sd and cn_sd
# use 5 - confirm whether that difference is intentional.
g2p_spearman_stats <- g2p_cor %>%
  group_by(Dataset, InG2P) %>%
  drop_na() %>%
  summarize(
    ge_mean = round(mean(Spearman_GE_Abs), 5),
    ge_sd = round(sd(Spearman_GE_Abs), 5),
    ge_n = length(Spearman_GE_Abs),
    ge_n_signif = sum(Spearman_Pval_GE < 0.05),
    ge_n_sig_perc = ge_n_signif / ge_n * 100,
    cn_mean = round(mean(Spearman_CN_Abs), 3),
    cn_sd = round(sd(Spearman_CN_Abs), 5),
    cn_n = length(Spearman_CN_Abs),
    cn_n_signif = sum(Spearman_Pval_CN < 0.05),
    cn_n_sig_perc = cn_n_signif / cn_n * 100
  )
# Put the mutation and Spearman summaries side by side, one row per
# Dataset x InG2P combination.
g2p_cor_stats <- merge(
  x = g2p_mut_all_summ_stats,
  y = g2p_spearman_stats,
  by = c("Dataset", "InG2P")
)
g2p_cor_stats_kable <- g2p_cor_stats %>%
  knitr::kable(caption = "Summary of number of features with significant correlations with drug response, both in G2P and not") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
Example p-value and Pearson correlation coefficient derivations for afatinib and ERBB2 in the CTRP dataset:
# Worked example for one drug-gene pair (afatinib / ERBB2, CTRP data).
ctrp_grid_summary_data <- ctrp_data_g2p_grid %>%
  filter(Drug_Gene == "afatinib_ERBB2")

# Mutation status: unpaired Wilcoxon rank-sum test of AUC z-score between
# mutation groups, using only rows with a known mutation status.
ctrp_grid_summary_mut_corr <- wilcox.test(
  AUC_zscore ~ Mutation_Status_Nonsilent,
  data = ctrp_grid_summary_data[!is.na(ctrp_grid_summary_data$Mutation_Status_Nonsilent),],
  paired = FALSE
)
ctrp_grid_summary_mut_corr_text <- paste0(
  "W = ",
  formatC(ctrp_grid_summary_mut_corr$statistic, format = "d", big.mark = ","),
  "\np = ",
  round(ctrp_grid_summary_mut_corr$p.value, 3)
)

# Gene expression: Pearson correlation between RPKM and AUC.
# NOTE(review): `use` is not a cor.test() argument (it belongs to cor()) and is
# absorbed by `...`; cor.test() itself drops incomplete pairs.
ctrp_grid_summary_ge_corr <- cor.test(
  x = ctrp_grid_summary_data$RPKM,
  y = ctrp_grid_summary_data$AUC,
  method = "pearson",
  use = "complete.obs"
)
# Label of the form "r(df) = 0.12\np = 0.034"; a p-value that rounds to 0 at
# three decimals is shown as "< 0.001".
ctrp_grid_summary_ge_corr_text <- paste0(
  "r(", ctrp_grid_summary_ge_corr$parameter, ") = ",
  formatC(ctrp_grid_summary_ge_corr$estimate, digits = 2, format = "f"),
  "\np ",
  ifelse(
    round(ctrp_grid_summary_ge_corr$p.value, 3) == 0,
    "< 0.001",
    paste0("= ", round(ctrp_grid_summary_ge_corr$p.value, 3))
  )
)

# Copy number: same test and label format as for gene expression.
ctrp_grid_summary_cn_corr <- cor.test(
  x = ctrp_grid_summary_data$CN,
  y = ctrp_grid_summary_data$AUC,
  method = "pearson",
  use = "complete.obs"
)
ctrp_grid_summary_cn_corr_text <- paste0(
  "r(", ctrp_grid_summary_cn_corr$parameter, ") = ",
  formatC(ctrp_grid_summary_cn_corr$estimate, digits = 2, format = "f"),
  "\np ",
  ifelse(
    round(ctrp_grid_summary_cn_corr$p.value, 3) == 0,
    "< 0.001",
    paste0("= ", round(ctrp_grid_summary_cn_corr$p.value, 3))
  )
)
# Box plot of afatinib AUC z-score by ERBB2 mutation status, annotated with the
# Wilcoxon test result and a bracket spanning the two groups.
ctrp_grid_summary_mut_plot <- ggplot(
  data = ctrp_grid_summary_data[!is.na(ctrp_grid_summary_data$Mutation_Status_Nonsilent),],
  aes(y = AUC_zscore, x = Mutation_Status_Nonsilent, fill = Mutation_Status_Nonsilent)
) +
  geom_boxplot(outlier.shape = NA, alpha = 0.6) +
  geom_jitter(width = 0.2, shape = 21) +
  scale_fill_manual(values = c("Wildtype" = "lightsteelblue1", "Mutant" = "steelblue4")) +
  annotate(geom = "text", label = ctrp_grid_summary_mut_corr_text, x = 1.5, y = 1.3) +
  # "none" replaces the deprecated `guides(fill = FALSE)`.
  guides(fill = "none") +
  labs(x = "ERBB2 mutation status", y = "Afatinib AUC (z-score)") +
  # Bracket: two short horizontal stubs plus two vertical ticks. Drawn with
  # annotate() so each segment is rendered once; constants inside
  # geom_segment(aes(...)) would be redrawn for every row of the data.
  annotate(
    geom = "segment",
    x = c(1, 1.9, 1, 2),
    xend = c(1.1, 2, 1, 2),
    y = c(1.5, 1.5, 1.5, 1.5),
    yend = c(1.5, 1.5, 1.35, 1.35),
    lwd = 0.2
  )
ctrp_grid_summary_mut_plot
# Scatter plot of afatinib AUC z-score against ERBB2 expression with a dashed
# linear fit; the Pearson result is printed in the panel and an arrow links
# the fitted line to that label.
ctrp_grid_summary_ge_plot <- ggplot(data = ctrp_grid_summary_data, aes(y = AUC_zscore, x = RPKM)) +
  geom_point(alpha = 0.25, size = 1.25, color = "gray4") +
  geom_smooth(method = "lm", se = FALSE, lwd = 0.5, lty = 2) +
  annotate(geom = "text", label = ctrp_grid_summary_ge_corr_text, x = 2000, y = 1.3) +
  labs(x = "ERBB2 gene expression (RPKM)", y = "Afatinib AUC (z-score)") +
  theme(axis.title.y = element_blank()) +
  # annotate() draws the arrow from fixed coordinates instead of once per data
  # row, which is what geom_curve(aes(<constants>)) did.
  annotate(
    geom = "curve",
    x = 750, y = -1.9, xend = 1720, yend = 1.1,
    lwd = 0.2,
    curvature = -0.2,
    arrow = arrow(length = unit(0.03, "npc"))
  )
ctrp_grid_summary_ge_plot
# Scatter plot of afatinib AUC z-score against ERBB2 copy number with a dashed
# linear fit; the Pearson result is printed in the panel and an arrow links
# the fitted line to that label.
ctrp_grid_summary_cn_plot <- ggplot(data = ctrp_grid_summary_data, aes(y = AUC_zscore, x = CN)) +
  geom_point(alpha = 0.25, size = 1.25, color = "gray4") +
  geom_smooth(method = "lm", se = FALSE, lwd = 0.5, lty = 2) +
  annotate(geom = "text", label = ctrp_grid_summary_cn_corr_text, x = 5, y = 1.3) +
  labs(x = "ERBB2 copy number (log2 ratio)") +
  theme(axis.title.y = element_blank()) +
  # annotate() draws the arrow from fixed coordinates instead of once per data
  # row, which is what geom_curve(aes(<constants>)) did.
  annotate(
    geom = "curve",
    x = 4, y = -2.3, xend = 5, yend = 0.8,
    lwd = 0.2,
    curvature = 0.2,
    arrow = arrow(length = unit(0.03, "npc"))
  )
ctrp_grid_summary_cn_plot
# ggsave("./plots/manuscript/ctrp_grid_summary_mut_plot.png", ctrp_grid_summary_mut_plot, device = "png", dpi = 450, width = 3, height = 4, units = "in")
# ggsave("./plots/manuscript/ctrp_grid_summary_ge_plot.png", ctrp_grid_summary_ge_plot, device = "png", dpi = 450, width = 5, height = 4, units = "in")
# ggsave("./plots/manuscript/ctrp_grid_summary_cn_plot.png", ctrp_grid_summary_cn_plot, device = "png", dpi = 450, width = 5, height = 4, units = "in")
Big grid plot by lineages:
# Stack the per-lineage mutation, gene-expression and copy-number results into
# one long table with shared columns (Statistic, Sign, Measure) for plotting.
# Note that "Statistic" holds the p-value for mutation status but the absolute
# Pearson coefficient for gene expression and copy number.
dgl_mut <- dgl_mut_grid_summ
dgl_mut$Sign <- "None" # mutation tests carry no direction
dgl_mut$Measure <- "Mutation Status"
colnames(dgl_mut)[colnames(dgl_mut) == "p"] <- "Statistic"
dgl_mut$Drug_Gene_Lin <- NULL

dgl_ge <- select(
  dgl_signif_pearson_g2p_grid,
  Drug_Gene, group_general_lineage_name, n,
  Statistic = Pearson_GE_Abs,
  InG2P, Dataset,
  Sign = Pearson_GE_Sign
)
dgl_ge$Measure <- "Gene Expression"

dgl_cn <- select(
  dgl_signif_pearson_g2p_grid,
  Drug_Gene, group_general_lineage_name, n,
  Statistic = Pearson_CN_Abs,
  InG2P, Dataset,
  Sign = Pearson_CN_Sign
)
dgl_cn$Measure <- "Copy Number"

dgl_all <- rbind(dgl_mut, dgl_ge, dgl_cn)
# A missing sign is shown in the same category as the mutation rows.
dgl_all$Sign <- ifelse(is.na(dgl_all$Sign), "None", dgl_all$Sign)

# One row of panels per lineage, one column per Measure x Dataset.
# NOTE(review): the shape values/labels are unnamed, so they are assigned to
# the Sign categories in scale order - confirm that order matches
# "Negative", "Positive", "NA".
dgl_all_plot <- ggplot(data = dgl_all, aes(x = InG2P, y = Statistic)) +
  facet_grid(group_general_lineage_name ~ Measure * Dataset) +
  geom_jitter(mapping = aes(shape = Sign, fill = InG2P, color = InG2P), width = 0.4, alpha = 0.5, size = 1) +
  geom_boxplot(width = 0.25, fill = NA, outlier.shape = NA) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24, 4), labels = c("Negative", "Positive", "NA")) +
  theme(legend.position = "top") +
  # "none" replaces the deprecated `guides(<aes> = FALSE)`.
  guides(color = "none", fill = "none")
# ggsave("./plots/manuscript/all_dgl_plot.png", dgl_all_plot, device = "png", dpi = 450, width = 12, height = 40, units = "in")
# Builds one summary table of drug-response associations for a dataset.
#
# Args:
#   df_list: named list of data frames, one per drug-gene pair (the names are
#     used as the Drug_Gene label). Each element must contain G2P_DGL,
#     group_general_lineage_name, AUC_zscore, RPKM, CN and
#     Mutation_Status_Nonsilent.
#   dataset_name: label stored in the Dataset column of every row.
#
# Returns:
#   A data frame with one row per drug-gene pair and G2P_DGL group holding the
#   lineages covered, Pearson and Spearman correlations of AUC z-score with
#   gene expression (RPKM) and copy number (CN), the Wilcoxon p-value for
#   mutation status (`p`) and a 0/1 flag `Mut_Signif` for p < 0.05.
#   An empty `df_list` yields an empty data frame.
MakeCorrResSummaryTable <- function(df_list, dataset_name) {
  # `1:length(df_list)` would iterate over c(1, 0) for an empty list.
  if (length(df_list) == 0) {
    return(data.frame())
  }

  # cor.test() discards incomplete (x, y) pairs itself; the `use` argument
  # belongs to cor() and was silently ignored, so it is not passed here.
  corr_estimate <- function(x, y, method) {
    cor.test(x = x, y = y, method = method)$estimate
  }

  per_pair <- lapply(seq_along(df_list), function(i) {
    pair_data <- df_list[[i]]
    corr_res <- pair_data %>%
      group_by(G2P_DGL) %>%
      summarize(
        Dataset = dataset_name,
        Drug_Gene = names(df_list)[i],
        Lineages = paste(as.character(unique(group_general_lineage_name)), collapse = ", "),
        N_Lineages = length(as.character(unique(group_general_lineage_name))),
        Pearson_Corr_GE = corr_estimate(RPKM, AUC_zscore, "pearson"),
        Pearson_Corr_CN = corr_estimate(CN, AUC_zscore, "pearson"),
        Spearman_Corr_GE = corr_estimate(RPKM, AUC_zscore, "spearman"),
        Spearman_Corr_CN = corr_estimate(CN, AUC_zscore, "spearman")
      )
    # Wilcoxon test of AUC z-score between mutation groups, per G2P_DGL group.
    mut_res <- compare_means(
      AUC_zscore ~ Mutation_Status_Nonsilent,
      data = pair_data,
      group.by = "G2P_DGL",
      method = "wilcox.test"
    )[, c("G2P_DGL", "p")]
    # all = TRUE keeps groups for which only one of the two results exists.
    corr_res <- merge(corr_res, mut_res, by = "G2P_DGL", all = TRUE)
    corr_res$Mut_Signif <- ifelse(corr_res$p < 0.05, 1, 0)
    corr_res
  })

  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, per_pair)
}
# Two-sample Kolmogorov-Smirnov tests comparing correlation coefficients of
# G2P ("Yes") and non-G2P ("No") rows.
#
# Args:
#   dataset: data frame with a G2P_DGL column ("Yes"/"No") and the metric
#     columns to compare (e.g. the output of MakeCorrResSummaryTable).
#   dataset_name: label stored in the Dataset column of the result.
#   metric_cols: positions or names of the metric columns. The default, 6:9,
#     matches the four correlation columns of MakeCorrResSummaryTable output.
#
# Returns:
#   A data frame with one row per metric: Metric (column name), D_stat (KS
#   statistic), pval, signif ("***" p < 0.001, "**" p < 0.01, "*" p < 0.05,
#   otherwise NA) and Dataset.
G2PCorr_KStest <- function(dataset, dataset_name, metric_cols = 6:9) {
  Metric <- if (is.character(metric_cols)) metric_cols else colnames(dataset)[metric_cols]

  # %in% treats a missing G2P_DGL as "not in this group" instead of producing
  # all-NA rows (which ks.test() would discard anyway).
  in_g2p <- dataset[dataset$G2P_DGL %in% "Yes", , drop = FALSE]
  not_in_g2p <- dataset[dataset$G2P_DGL %in% "No", , drop = FALSE]

  n_metrics <- length(Metric)
  D_stat <- numeric(n_metrics)
  pval <- numeric(n_metrics)
  # Always character, so the column type does not depend on the results.
  signif_code <- rep(NA_character_, n_metrics)

  for (j in seq_len(n_metrics)) {
    test <- ks.test(in_g2p[[Metric[j]]], not_in_g2p[[Metric[j]]])
    D_stat[j] <- unname(test$statistic)
    pval[j] <- test$p.value
    p <- test$p.value
    signif_code[j] <- if (is.na(p)) {
      NA_character_
    } else if (p < 0.001) {
      "***"
    } else if (p < 0.01) {
      "**"
    } else if (p < 0.05) {
      "*"
    } else {
      NA_character_
    }
  }

  df <- data.frame(Metric, D_stat, pval, signif = signif_code)
  df$Dataset <- rep(dataset_name, n_metrics)
  df
}
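# NOTE: ks.test() can warn "cannot compute exact p-value with ties" when the compared samples contain tied values. TODO: confirm the reported p-values are acceptable in that case.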
# For each dataset: flag rows whose drug-gene-lineage combination is listed in
# g2p_dgls, split the data by drug-gene pair, and summarise the associations.
ccle_data_g2p[["G2P_DGL"]] <- ifelse(ccle_data_g2p[["Drug_Gene_Lin"]] %in% g2p_dgls, "Yes", "No")
ccle_data_g2p_split <- split(x = ccle_data_g2p, f = ccle_data_g2p[["Drug_Gene"]])
ccle_corr_all <- MakeCorrResSummaryTable(df_list = ccle_data_g2p_split, dataset_name = "CCLE")

ctrp_data_g2p[["G2P_DGL"]] <- ifelse(ctrp_data_g2p[["Drug_Gene_Lin"]] %in% g2p_dgls, "Yes", "No")
ctrp_data_g2p_split <- split(x = ctrp_data_g2p, f = ctrp_data_g2p[["Drug_Gene"]])
ctrp_corr_all <- MakeCorrResSummaryTable(df_list = ctrp_data_g2p_split, dataset_name = "CTRP")

gdsc_data_g2p[["G2P_DGL"]] <- ifelse(gdsc_data_g2p[["Drug_Gene_Lin"]] %in% g2p_dgls, "Yes", "No")
gdsc_data_g2p_split <- split(x = gdsc_data_g2p, f = gdsc_data_g2p[["Drug_Gene"]])
gdsc_corr_all <- MakeCorrResSummaryTable(df_list = gdsc_data_g2p_split, dataset_name = "GDSC")

g2p_corr_all <- rbind(ccle_corr_all, ctrp_corr_all, gdsc_corr_all)

# KS tests (G2P vs non-G2P) on the correlation columns of each dataset.
ccle_corr_stats <- G2PCorr_KStest(dataset = ccle_corr_all, dataset_name = "CCLE")
ctrp_corr_stats <- G2PCorr_KStest(dataset = ctrp_corr_all, dataset_name = "CTRP")
gdsc_corr_stats <- G2PCorr_KStest(dataset = gdsc_corr_all, dataset_name = "GDSC")

# Column names such as "Pearson_Corr_GE" are split into method / "Corr" /
# metric, and the metric code is then spelled out.
g2p_corr_stats <- rbind(ccle_corr_stats, ctrp_corr_stats, gdsc_corr_stats) %>%
  separate(col = Metric, into = c("Correlation_Method", "corr", "Metric"), sep = "_")
g2p_corr_stats[["Metric"]] <- ifelse(g2p_corr_stats[["Metric"]] == "GE", "Gene Expression", "Copy Number")
g2p_corr_stats
## Correlation_Method corr Metric D_stat pval signif
## 1 Pearson Corr Gene Expression 0.4444444 0.351707116 <NA>
## 2 Pearson Corr Copy Number 0.3333333 0.730111065 <NA>
## 3 Spearman Corr Gene Expression 0.3333333 0.730111065 <NA>
## 4 Spearman Corr Copy Number 0.3333333 0.730111065 <NA>
## 5 Pearson Corr Gene Expression 0.2500000 0.099563966 <NA>
## 6 Pearson Corr Copy Number 0.2500000 0.099563966 <NA>
## 7 Spearman Corr Gene Expression 0.2708333 0.058763021 <NA>
## 8 Spearman Corr Copy Number 0.3750000 0.002341759 **
## 9 Pearson Corr Gene Expression 0.2727273 0.075539610 <NA>
## 10 Pearson Corr Copy Number 0.2272727 0.205836238 <NA>
## 11 Spearman Corr Gene Expression 0.2500000 0.128169305 <NA>
## 12 Spearman Corr Copy Number 0.2500000 0.127822319 <NA>
## Dataset
## 1 CCLE
## 2 CCLE
## 3 CCLE
## 4 CCLE
## 5 CTRP
## 6 CTRP
## 7 CTRP
## 8 CTRP
## 9 GDSC
## 10 GDSC
## 11 GDSC
## 12 GDSC
# Per dataset and G2P_DGL group: number of significant mutation tests, number
# of tests with a result, and the percentage significant. Rows without a
# mutation test result (Mut_Signif is NA) are excluded.
g2p_mut_stats <- g2p_corr_all %>%
  group_by(Dataset, G2P_DGL) %>%
  drop_na(Mut_Signif) %>%
  summarize(
    N_Signif_Muts = sum(Mut_Signif),
    Total_Tests = length(Mut_Signif),
    Perc_Signif = N_Signif_Muts / Total_Tests * 100
  )
g2p_mut_stats
## # A tibble: 6 x 5
## # Groups: Dataset [?]
## Dataset G2P_DGL N_Signif_Muts Total_Tests Perc_Signif
## <chr> <chr> <dbl> <int> <dbl>
## 1 CCLE No 1 9 11.1
## 2 CCLE Yes 1 7 14.3
## 3 CTRP No 7 48 14.6
## 4 CTRP Yes 5 38 13.2
## 5 GDSC No 3 44 6.82
## 6 GDSC Yes 3 34 8.82
Plot of Kolmogorov-Smirnov test results comparing correlations between CN/GE and AUC z-score in G2P and non-G2P lineages:
# Long-format copy of the correlation table for plotting: the mutation columns
# are removed, then columns 6-9 (the four correlation coefficients) are
# stacked into a single "Correlation" column keyed by `variable`.
g2p_corr_4plot <- g2p_corr_all
g2p_corr_4plot[["p"]] <- NULL
g2p_corr_4plot[["Mut_Signif"]] <- NULL
g2p_corr_4plot <- melt(
  g2p_corr_4plot,
  id.vars = colnames(g2p_corr_4plot)[1:5],
  measure.vars = colnames(g2p_corr_4plot)[6:9],
  value.name = "Correlation"
)
# e.g. "Spearman_Corr_CN" -> Correlation_Method "Spearman", Metric "CN";
# `variable` itself is kept.
g2p_corr_4plot <- separate(
  g2p_corr_4plot,
  col = variable,
  into = c("Correlation_Method", "corr", "Metric"),
  sep = "_",
  remove = FALSE
)
g2p_corr_4plot[["Metric"]] <- ifelse(g2p_corr_4plot[["Metric"]] == "GE", "Gene Expression", "Copy Number")
# Violin/box/jitter plot of correlation coefficients for G2P vs non-G2P
# lineages, one panel per correlation method x (metric, dataset). Each panel
# is annotated with its KS statistic, p-value and significance code.
g2p_ks_res_plot <- ggplot(data = g2p_corr_4plot) +
  facet_grid(Correlation_Method ~ Metric * Dataset) +
  geom_violin(aes(y = Correlation, x = G2P_DGL, fill = G2P_DGL, color = G2P_DGL), alpha = 0.3) +
  geom_boxplot(width = 0.075, aes(y = Correlation, x = G2P_DGL, color = G2P_DGL)) +
  geom_jitter(width = 0.3, alpha = 0.7, aes(y = Correlation, x = G2P_DGL, color = G2P_DGL)) +
  theme(legend.position = "none") +
  coord_cartesian(ylim = c(-1, 1)) +
  # p-values above 0.1 are shown with two decimals, smaller ones with three.
  geom_text(
    data = g2p_corr_stats,
    aes(
      x = 1.5, y = 0.8,
      label = paste0(
        "D = ", formatC(D_stat, digits = 2, format = "f"),
        ", p = ", ifelse(pval > 0.1, round(pval, 2), round(pval, 3)),
        "\n", ifelse(is.na(signif), "", signif)
      )
    )
  ) +
  # Axis label typo fixed ("Coefficint" -> "Coefficient").
  labs(
    x = "Is the lineage in a G2P DGL?",
    y = "Correlation Coefficient",
    subtitle = "Kolmogorov-Smirnov test results comparing correlations between CN/GE and AUC z-score in G2P and non-G2P lineages."
  )
g2p_ks_res_plot
# ggsave("./plots/manuscript/g2p_ks_res_plot.png", g2p_ks_res_plot, device = "png", dpi = 450, width = 12, height = 6, units = "in")
# Spearman-only version of the KS comparison plot: one row of panels per
# metric, one column per dataset, coloured by dataset; G2P membership controls
# the violin opacity.
# NOTE(review): the labeller is keyed on "yfacet", which is not one of the
# faceting variables used here, and Metric already holds the long names -
# it looks like it has no effect; confirm before removing.
g2p_ks_res_plot_spearman <- ggplot(data = filter(g2p_corr_4plot, Correlation_Method == "Spearman")) +
  facet_grid(
    Metric ~ Dataset,
    labeller = labeller(yfacet = c("CN" = "Copy Number", "GE" = "Gene Expression"))
  ) +
  geom_violin(aes(x = G2P_DGL, y = Correlation, fill = Dataset, color = Dataset, alpha = G2P_DGL)) +
  geom_boxplot(aes(x = G2P_DGL, y = Correlation, color = Dataset), width = 0.075) +
  geom_jitter(aes(x = G2P_DGL, y = Correlation, color = Dataset), width = 0.3, alpha = 0.8) +
  # Per-panel KS result; p-values above 0.1 get two decimals, others three.
  geom_text(
    data = filter(g2p_corr_stats, Correlation_Method == "Spearman"),
    aes(
      x = 1.5, y = 0.8,
      label = paste0(
        "D = ", formatC(D_stat, digits = 2, format = "f"),
        ", p = ", ifelse(pval > 0.1, round(pval, 2), round(pval, 3)),
        "\n", ifelse(is.na(signif), "", signif)
      )
    )
  ) +
  scale_alpha_manual(values = c("Yes" = 0.6, "No" = 0.2)) +
  scale_fill_manual(values = c("CCLE" = "springgreen4", "CTRP" = "steelblue4", "GDSC" = "turquoise4")) +
  scale_color_manual(values = c("CCLE" = "springgreen4", "CTRP" = "steelblue4", "GDSC" = "turquoise4")) +
  coord_cartesian(ylim = c(-1, 1)) +
  labs(x = "Is the lineage in a G2P DGL?", y = "Spearman Correlation Coefficient") +
  theme(legend.position = "none")
g2p_ks_res_plot_spearman
# ggsave("./plots/manuscript/g2p_spearman_ks_res_plot.pdf", g2p_ks_res_plot_spearman, device = "pdf", dpi = 500, width = 12, height = 6, units = "in")
# Gene expression: pair every lineage-level Pearson p-value with the
# pan-cancer p-value of the same drug-gene pair and dataset.
pancan_ge <- all_pearson_res %>%
  filter(Metric == "Gene Expression") %>%
  select(Drug_Gene, p_pancan = p, Dataset, p_log10_pancan = p_log10)
compare_ge <- select(
  dgl_signif_pearson_g2p_grid,
  Drug_Gene, group_general_lineage_name, n,
  p = Pearson_Pval_GE,
  Drug_Gene_Lin, InG2P, Dataset
)
compare_ge[["p_log10"]] <- -log10(compare_ge[["p"]])
# Full outer join: pairs present on only one side are kept with NAs.
compare_ge <- merge(x = compare_ge, y = pancan_ge, by = c("Drug_Gene", "Dataset"), all = TRUE)
# NA when either p-value is missing.
compare_ge[["Compare_Pvals"]] <- ifelse(
  compare_ge[["p"]] < compare_ge[["p_pancan"]],
  "lineage more sig",
  "pancan more sig"
)
compare_ge[["Metric"]] <- "Gene Expression"
# Copy number: pair every lineage-level Pearson p-value with the pan-cancer
# p-value of the same drug-gene pair and dataset.
pancan_cn <- all_pearson_res %>%
  filter(Metric == "Copy Number") %>%
  select(Drug_Gene, p_pancan = p, Dataset, p_log10_pancan = p_log10)
compare_cn <- select(
  dgl_signif_pearson_g2p_grid,
  Drug_Gene, group_general_lineage_name, n,
  p = Pearson_Pval_CN,
  Drug_Gene_Lin, InG2P, Dataset
)
compare_cn[["p_log10"]] <- -log10(compare_cn[["p"]])
# Full outer join: pairs present on only one side are kept with NAs.
compare_cn <- merge(x = compare_cn, y = pancan_cn, by = c("Drug_Gene", "Dataset"), all = TRUE)
# NA when either p-value is missing.
compare_cn[["Compare_Pvals"]] <- ifelse(
  compare_cn[["p"]] < compare_cn[["p_pancan"]],
  "lineage more sig",
  "pancan more sig"
)
compare_cn[["Metric"]] <- "Copy Number"
# Mutation status: pair every lineage-level p-value with the pan-cancer
# p-value of the same drug-gene pair and dataset.
pancan_mut <- all_mut_res
pancan_mut[["Metric"]] <- NULL
pancan_mut[["corr"]] <- NULL
names(pancan_mut)[names(pancan_mut) == "p"] <- "p_pancan"
names(pancan_mut)[names(pancan_mut) == "p_log10"] <- "p_log10_pancan"
compare_mut <- dgl_mut_grid_summ
compare_mut[["p_log10"]] <- -log10(compare_mut[["p"]])
# Full outer join: pairs present on only one side are kept with NAs.
compare_mut <- merge(x = compare_mut, y = pancan_mut, by = c("Drug_Gene", "Dataset"), all = TRUE)
compare_mut[["Compare_Pvals"]] <- ifelse(
  compare_mut[["p"]] < compare_mut[["p_pancan"]],
  "lineage more sig",
  "pancan more sig"
)
compare_mut[["Metric"]] <- "Mutation Status"

# Combine the three metrics and keep only G2P drug-gene pairs. Rows where the
# comparison could not be made (a p-value was missing) get their own label.
compare_data <- rbind(compare_ge, compare_cn, compare_mut) %>%
  filter(Drug_Gene %in% g2p_druggenes)
compare_data[["Compare_Pvals"]] <- ifelse(
  is.na(compare_data[["Compare_Pvals"]]),
  "in G2P but not measured",
  compare_data[["Compare_Pvals"]]
)
# Add grid entries for counting non-G2P drug-gene associations
# compare_grid <- expand.grid(g2p_druggenes, unique(ccl_converter$group_general_lineage_name), c("CCLE", "CTRP", "GDSC"), c("Gene Expression", "Copy Number", "Mutation Status"))
# colnames(compare_grid) <- c("Drug_Gene", "group_general_lineage_name", "Dataset", "Metric")
# compare_data <- merge(compare_grid, compare_data, by = c("Drug_Gene", "group_general_lineage_name", "Dataset", "Metric"), all.x = TRUE)
# compare_data$Compare_Pvals <- ifelse(is.na(compare_data$Compare_Pvals), "not in G2P", compare_data$Compare_Pvals)
# Number of rows per dataset, metric and comparison outcome, plus an HTML
# table of those counts.
compare_data_stats <- count(compare_data, Dataset, Metric, Compare_Pvals)
compare_data_stats_kable <- compare_data_stats %>%
  knitr::kable(caption = "Comparison of lineage-specific and pancancer p-values for G2P drug-gene associations") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "900px", height = "450px")
# Grid of G2P drug-gene pairs (rows) by lineage (columns), one panel per
# dataset, for mutation status. Point shape shows whether the lineage-level
# p-value is below or above the pan-cancer one; fill shows -log10(p), which
# stays white up to the midpoint 1.3010299957 (= -log10(0.05)).
#
# Compare_Pvals takes the values "lineage more sig", "pancan more sig" and
# "in G2P but not measured". The subtitle counts and the shape scale use those
# values; the former "Above" / "Under" / NA filters never matched, and a
# two-value shape scale cannot cover three categories.
# NOTE(review): "Below" is assigned to "lineage more sig" (lineage p below
# pan-cancer p) - confirm this is the intended reading.
compare_mut_plot <- local({
  plot_data <- filter(compare_data, Metric == "Mutation Status")
  n_with_status <- function(status) sum(plot_data$Compare_Pvals == status, na.rm = TRUE)
  ggplot(data = plot_data, aes(x = group_general_lineage_name, y = Drug_Gene)) +
    facet_wrap(~ Dataset) +
    geom_point(aes(shape = Compare_Pvals, fill = p_log10), color = "gray50") +
    scale_fill_gradient2(low = "white", mid = "white", high = "darkorchid4", midpoint = 1.3010299957) +
    scale_shape_manual(
      values = c("lineage more sig" = 25, "pancan more sig" = 24, "in G2P but not measured" = 4),
      breaks = c("lineage more sig", "pancan more sig", "in G2P but not measured"),
      labels = c("Below", "Above", "Not measured")
    ) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4), legend.position = "top") +
    labs(
      title = "Mutation Status",
      subtitle = paste0(
        "Above: ", n_with_status("pancan more sig"), "\n",
        "Under: ", n_with_status("lineage more sig"), "\n",
        "Not tested: ", n_with_status("in G2P but not measured")
      ),
      x = "Lineage", y = "Drug-Gene",
      shape = "Compare lineage p-value to pancancer p-value:",
      fill = "-log10(p)"
    )
})
# ggsave("./plots/manuscript/compare_mut_plot.png", compare_mut_plot, device = "png", dpi = 450, width = 12, height = 12, units = "in")
# Grid of G2P drug-gene pairs (rows) by lineage (columns), one panel per
# dataset, for gene expression. Point shape shows whether the lineage-level
# p-value is below or above the pan-cancer one; fill shows -log10(p), which
# stays white up to the midpoint 1.3010299957 (= -log10(0.05)).
#
# Compare_Pvals takes the values "lineage more sig", "pancan more sig" and
# "in G2P but not measured". The subtitle counts and the shape scale use those
# values; the former "Above" / "Under" / NA filters never matched, and a
# two-value shape scale cannot cover three categories.
# NOTE(review): "Below" is assigned to "lineage more sig" (lineage p below
# pan-cancer p) - confirm this is the intended reading.
compare_ge_plot <- local({
  plot_data <- filter(compare_data, Metric == "Gene Expression")
  n_with_status <- function(status) sum(plot_data$Compare_Pvals == status, na.rm = TRUE)
  ggplot(data = plot_data, aes(x = group_general_lineage_name, y = Drug_Gene)) +
    facet_wrap(~ Dataset) +
    geom_point(aes(shape = Compare_Pvals, fill = p_log10), color = "gray50") +
    scale_fill_gradient2(low = "white", mid = "white", high = "darkorchid4", midpoint = 1.3010299957) +
    scale_shape_manual(
      values = c("lineage more sig" = 25, "pancan more sig" = 24, "in G2P but not measured" = 4),
      breaks = c("lineage more sig", "pancan more sig", "in G2P but not measured"),
      labels = c("Below", "Above", "Not measured")
    ) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4), legend.position = "top") +
    labs(
      title = "Gene Expression",
      subtitle = paste0(
        "Above: ", n_with_status("pancan more sig"), "\n",
        "Under: ", n_with_status("lineage more sig"), "\n",
        "Not tested: ", n_with_status("in G2P but not measured")
      ),
      x = "Lineage", y = "Drug-Gene",
      shape = "Compare lineage p-value to pancancer p-value:",
      fill = "-log10(p)"
    )
})
# ggsave("./plots/manuscript/compare_ge_plot.png", compare_ge_plot, device = "png", dpi = 450, width = 12, height = 12, units = "in")
# Grid of G2P drug-gene pairs (rows) by lineage (columns), one panel per
# dataset, for copy number. Point shape shows whether the lineage-level
# p-value is below or above the pan-cancer one; fill shows -log10(p), which
# stays white up to the midpoint 1.3010299957 (= -log10(0.05)).
#
# Compare_Pvals takes the values "lineage more sig", "pancan more sig" and
# "in G2P but not measured". The subtitle counts and the shape scale use those
# values; the former "Above" / "Under" / NA filters never matched, and a
# two-value shape scale cannot cover three categories.
# NOTE(review): "Below" is assigned to "lineage more sig" (lineage p below
# pan-cancer p) - confirm this is the intended reading.
compare_cn_plot <- local({
  plot_data <- filter(compare_data, Metric == "Copy Number")
  n_with_status <- function(status) sum(plot_data$Compare_Pvals == status, na.rm = TRUE)
  ggplot(data = plot_data, aes(x = group_general_lineage_name, y = Drug_Gene)) +
    facet_wrap(~ Dataset) +
    geom_point(aes(shape = Compare_Pvals, fill = p_log10), color = "gray50") +
    scale_fill_gradient2(low = "white", mid = "white", high = "darkorchid4", midpoint = 1.3010299957) +
    scale_shape_manual(
      values = c("lineage more sig" = 25, "pancan more sig" = 24, "in G2P but not measured" = 4),
      breaks = c("lineage more sig", "pancan more sig", "in G2P but not measured"),
      labels = c("Below", "Above", "Not measured")
    ) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4), legend.position = "top") +
    labs(
      title = "Copy Number",
      subtitle = paste0(
        "Above: ", n_with_status("pancan more sig"), "\n",
        "Under: ", n_with_status("lineage more sig"), "\n",
        "Not tested: ", n_with_status("in G2P but not measured")
      ),
      x = "Lineage", y = "Drug-Gene",
      shape = "Compare lineage p-value to pancancer p-value:",
      fill = "-log10(p)"
    )
})
# ggsave("./plots/manuscript/compare_cn_plot.png", compare_cn_plot, device = "png", dpi = 450, width = 12, height = 12, units = "in")
# t-tests comparing correlation coefficients between G2P and non-G2P drug-gene
# pairs (grouping column InG2P), per dataset, correlation method and metric.
# Each result is also formatted as a "t(df) = statistic, p = value" label for
# the plots.

# Builds the label; `p_text` is the already formatted p-value part including
# its comparison sign (e.g. "= 0.03" or "< 0.001").
format_ttest_label <- function(ttest, p_text) {
  paste0("t(", round(ttest$parameter, 0), ") = ", round(ttest$statistic, 2), ", p ", p_text)
}

# CCLE
ccle_pearson_ttest_GE <- t.test(ccle_cor_res$Pearson_Corr_GE ~ ccle_cor_res$InG2P)
ccle_pearson_ttest_res_GE <- format_ttest_label(ccle_pearson_ttest_GE, paste0("= ", round(ccle_pearson_ttest_GE$p.value, 2)))
ccle_pearson_ttest_CN <- t.test(ccle_cor_res$Pearson_Corr_CN ~ ccle_cor_res$InG2P)
ccle_pearson_ttest_res_CN <- format_ttest_label(ccle_pearson_ttest_CN, paste0("= ", round(ccle_pearson_ttest_CN$p.value, 2)))
ccle_spearman_ttest_GE <- t.test(ccle_cor_res$Spearman_Corr_GE ~ ccle_cor_res$InG2P)
ccle_spearman_ttest_res_GE <- format_ttest_label(ccle_spearman_ttest_GE, paste0("= ", round(ccle_spearman_ttest_GE$p.value, 2)))
ccle_spearman_ttest_CN <- t.test(ccle_cor_res$Spearman_Corr_CN ~ ccle_cor_res$InG2P)
# `digits` must be named: the second positional argument of formatC() is
# `width`, so `formatC(p, 2, format = "f")` did not limit the decimals.
ccle_spearman_ttest_res_CN <- format_ttest_label(ccle_spearman_ttest_CN, paste0("= ", formatC(ccle_spearman_ttest_CN$p.value, digits = 2, format = "f")))

# CTRP
ctrp_pearson_ttest_GE <- t.test(ctrp_cor_res$Pearson_Corr_GE ~ ctrp_cor_res$InG2P)
# A p-value that rounds to 0 at three decimals is reported as "< 0.001".
ctrp_pearson_ttest_res_GE <- format_ttest_label(ctrp_pearson_ttest_GE, ifelse(round(ctrp_pearson_ttest_GE$p.value, 3) == 0, "< 0.001", paste0("= ", round(ctrp_pearson_ttest_GE$p.value, 3))))
ctrp_pearson_ttest_CN <- t.test(ctrp_cor_res$Pearson_Corr_CN ~ ctrp_cor_res$InG2P)
ctrp_pearson_ttest_res_CN <- format_ttest_label(ctrp_pearson_ttest_CN, paste0("= ", round(ctrp_pearson_ttest_CN$p.value, 3)))
ctrp_spearman_ttest_GE <- t.test(ctrp_cor_res$Spearman_Corr_GE ~ ctrp_cor_res$InG2P)
ctrp_spearman_ttest_res_GE <- format_ttest_label(ctrp_spearman_ttest_GE, paste0("= ", round(ctrp_spearman_ttest_GE$p.value, 2)))
ctrp_spearman_ttest_CN <- t.test(ctrp_cor_res$Spearman_Corr_CN ~ ctrp_cor_res$InG2P)
ctrp_spearman_ttest_res_CN <- format_ttest_label(ctrp_spearman_ttest_CN, paste0("= ", round(ctrp_spearman_ttest_CN$p.value, 2)))

# GDSC
gdsc_pearson_ttest_GE <- t.test(gdsc_cor_res$Pearson_Corr_GE ~ gdsc_cor_res$InG2P)
gdsc_pearson_ttest_res_GE <- format_ttest_label(gdsc_pearson_ttest_GE, paste0("= ", round(gdsc_pearson_ttest_GE$p.value, 3)))
gdsc_pearson_ttest_CN <- t.test(gdsc_cor_res$Pearson_Corr_CN ~ gdsc_cor_res$InG2P)
gdsc_pearson_ttest_res_CN <- format_ttest_label(gdsc_pearson_ttest_CN, paste0("= ", round(gdsc_pearson_ttest_CN$p.value, 3)))
gdsc_spearman_ttest_GE <- t.test(gdsc_cor_res$Spearman_Corr_GE ~ gdsc_cor_res$InG2P)
gdsc_spearman_ttest_res_GE <- format_ttest_label(gdsc_spearman_ttest_GE, paste0("= ", round(gdsc_spearman_ttest_GE$p.value, 2)))
gdsc_spearman_ttest_CN <- t.test(gdsc_cor_res$Spearman_Corr_CN ~ gdsc_cor_res$InG2P)
gdsc_spearman_ttest_res_CN <- format_ttest_label(gdsc_spearman_ttest_CN, paste0("= ", round(gdsc_spearman_ttest_CN$p.value, 2)))

# One row per dataset with the four labels, used as annotation data in plots.
g2p_cor_text <- data.frame(
  Dataset = c("CCLE", "CTRP", "GDSC"),
  Pearson_Test_Results_GE = c(ccle_pearson_ttest_res_GE, ctrp_pearson_ttest_res_GE, gdsc_pearson_ttest_res_GE),
  Pearson_Test_Results_CN = c(ccle_pearson_ttest_res_CN, ctrp_pearson_ttest_res_CN, gdsc_pearson_ttest_res_CN),
  Spearman_Test_Results_GE = c(ccle_spearman_ttest_res_GE, ctrp_spearman_ttest_res_GE, gdsc_spearman_ttest_res_GE),
  Spearman_Test_Results_CN = c(ccle_spearman_ttest_res_CN, ctrp_spearman_ttest_res_CN, gdsc_spearman_ttest_res_CN)
)
CRISPR mutation status plots of p-values:
# Single-row heat map of CRISPR mutation-status p-values, one tile per gene.
crispr_signif_g2p_gene[["y_label"]] <- "CRISPR"
# Using g2p_genes as the factor levels fixes the gene order on the x axis.
crispr_signif_g2p_gene[["Hugo_Symbol"]] <- factor(
  crispr_signif_g2p_gene[["Hugo_Symbol"]],
  levels = g2p_genes
)
# Going through character first yields the printed values rather than factor
# codes if `p` is stored as a factor.
crispr_signif_g2p_gene[["p"]] <- as.numeric(as.character(crispr_signif_g2p_gene[["p"]]))
# NOTE(review): `values` in scale_fill_gradientn() are positions on the
# rescaled 0-1 range of the plotted data, so 0.05 only corresponds to p = 0.05
# if the p-values span the full [0, 1] range (or limits are set) - confirm.
crispr_mut_heatmap_plot <- ggplot(
  data = crispr_signif_g2p_gene,
  aes(x = Hugo_Symbol, y = y_label)
) +
  geom_tile(aes(fill = p)) +
  scale_fill_gradientn(
    colors = c("darkgreen", "honeydew", "white"),
    values = c(0, 0.05, 1),
    na.value = "darksalmon"
  ) +
  # drop = FALSE keeps factor levels that have no rows on the axis.
  scale_x_discrete(drop = FALSE) +
  labs(x = "Gene") +
  theme(
    axis.title.y = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 10)
  )
crispr_mut_heatmap_plot
# write.table(crispr_signif_g2p_gene, file = "~/Desktop/crispr_signif_g2p_gene_20181203.csv", quote = FALSE, sep = ",", row.names = FALSE)
# ggsave("./plots/manuscript/crispr_mut_heatmap_plot_example.png", crispr_mut_heatmap_plot, device = "png", dpi = 450, width = 12, height = 1.5, units = "in")
# For each dataset: join the per-pair tally in g2p_druggene_tally (column `n`)
# with the mutation-association p-values, mark pairs with a non-zero tally as
# being in G2P, and t-test the p-values between the two groups. Pairs missing
# from the tally get n = 0.

# CCLE
ccle_mut_summ <- merge(
  x = g2p_druggene_tally,
  y = ccle_signif_g2p_grid[, c("Drug_Gene", "p")],
  by = "Drug_Gene",
  all = TRUE
)
ccle_mut_summ[["n"]] <- ifelse(is.na(ccle_mut_summ[["n"]]), 0, ccle_mut_summ[["n"]])
ccle_mut_summ[["InG2P"]] <- ifelse(ccle_mut_summ[["n"]] != 0, "Yes", "No")
ccle_mut_summ[["Dataset"]] <- "CCLE"
ccle_p_ttest <- t.test(ccle_mut_summ$p ~ ccle_mut_summ$InG2P)
ccle_p_ttest_res <- paste0(
  "t(", round(ccle_p_ttest$parameter, 0), ") = ",
  round(ccle_p_ttest$statistic, 2),
  ", p = ", round(ccle_p_ttest$p.value, 2)
)

# CTRP
ctrp_mut_summ <- merge(
  x = g2p_druggene_tally,
  y = ctrp_signif_g2p_grid[, c("Drug_Gene", "p")],
  by = "Drug_Gene",
  all = TRUE
)
ctrp_mut_summ[["n"]] <- ifelse(is.na(ctrp_mut_summ[["n"]]), 0, ctrp_mut_summ[["n"]])
ctrp_mut_summ[["InG2P"]] <- ifelse(ctrp_mut_summ[["n"]] != 0, "Yes", "No")
ctrp_mut_summ[["Dataset"]] <- "CTRP"
ctrp_p_ttest <- t.test(ctrp_mut_summ$p ~ ctrp_mut_summ$InG2P)
ctrp_p_ttest_res <- paste0(
  "t(", round(ctrp_p_ttest$parameter, 0), ") = ",
  round(ctrp_p_ttest$statistic, 2),
  ", p = ", formatC(ctrp_p_ttest$p.value, digits = 2, format = "f")
)

# GDSC
gdsc_mut_summ <- merge(
  x = g2p_druggene_tally,
  y = gdsc_signif_g2p_grid[, c("Drug_Gene", "p")],
  by = "Drug_Gene",
  all = TRUE
)
gdsc_mut_summ[["n"]] <- ifelse(is.na(gdsc_mut_summ[["n"]]), 0, gdsc_mut_summ[["n"]])
gdsc_mut_summ[["InG2P"]] <- ifelse(gdsc_mut_summ[["n"]] != 0, "Yes", "No")
gdsc_mut_summ[["Dataset"]] <- "GDSC"
gdsc_p_ttest <- t.test(gdsc_mut_summ$p ~ gdsc_mut_summ$InG2P)
gdsc_p_ttest_res <- paste0(
  "t(", round(gdsc_p_ttest$parameter, 0), ") = ",
  round(gdsc_p_ttest$statistic, 2),
  ", p = ", formatC(gdsc_p_ttest$p.value, digits = 2, format = "f")
)
# Stack the three per-dataset tables and add -log10(p).
g2p_mut_all_summ <- rbind(ccle_mut_summ, ctrp_mut_summ, gdsc_mut_summ)
g2p_mut_all_summ[["p_log10"]] <- -log10(g2p_mut_all_summ[["p"]])
# Per-dataset t-test labels, used as annotation data in the plots.
g2p_mut_all_summ_text <- data.frame(
  Dataset = c("CCLE", "CTRP", "GDSC"),
  Test_Results_p = c(ccle_p_ttest_res, ctrp_p_ttest_res, gdsc_p_ttest_res)
)
# Factor version of Drug_Gene in order of first appearance, displayed as
# "<drug>\n<gene>" (the text before and after the first underscore).
g2p_mut_all_summ[["Drug_Gene_Ordered"]] <- local({
  pair_levels <- as.character(unique(g2p_mut_all_summ$Drug_Gene))
  pair_labels <- vapply(
    strsplit(x = pair_levels, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2]),
    character(1)
  )
  factor(g2p_mut_all_summ$Drug_Gene, levels = pair_levels, labels = pair_labels)
})
# Lower panel of the static figure: mutation-association p-value (x) against
# the tally `n` (y) for every drug-gene pair, one facet per dataset. G2P pairs
# with n > 10 are labelled and the per-dataset t-test result is printed at the
# top of each facet. Facet strips are hidden.
plot_p_point <- ggplot(
  data = g2p_mut_all_summ,
  mapping = aes(x = p, y = n, label = Drug_Gene)
) +
  facet_wrap(~ Dataset, scales = "free_x") +
  geom_point(mapping = aes(color = InG2P), alpha = 0.5) +
  geom_text_repel(
    data = subset(g2p_mut_all_summ, InG2P == "Yes" & n > 10),
    size = 3,
    segment.size = 0.2,
    segment.color = "grey50"
  ) +
  geom_text(
    data = g2p_mut_all_summ_text,
    mapping = aes(x = 0.5, y = 115, label = Test_Results_p)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_y_continuous(breaks = seq(0, 120, by = 10), labels = seq(0, 120, by = 10)) +
  labs(
    x = "Gene-drug association p-value between mutation groups",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )
# Upper panel of the static figure: horizontal violin and box plots of the
# p-values for G2P vs non-G2P pairs, one facet per dataset. The p-value axis
# title, text and ticks are hidden.
plot_p_box <- ggplot(data = g2p_mut_all_summ) +
  facet_wrap(~ Dataset, scales = "free_x") +
  geom_violin(
    mapping = aes(x = InG2P, y = p, color = InG2P, fill = InG2P),
    alpha = 0.35
  ) +
  geom_boxplot(
    mapping = aes(x = InG2P, y = p, color = InG2P),
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  coord_flip() +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )
# Convert both panels to gtables and give them identical column widths (the
# element-wise maximum of the two) so their plotting areas line up, then stack
# the box panel above the scatter panel with a 1:4 height ratio.
plot_p_point <- ggplotGrob(plot_p_point)
plot_p_box <- ggplotGrob(plot_p_box)
p_maxWidth <- grid::unit.pmax(plot_p_point$widths, plot_p_box$widths)
plot_p_point$widths <- as.list(p_maxWidth)
plot_p_box$widths <- as.list(p_maxWidth)
g2p_mut_plot <- cowplot::plot_grid(
  plot_p_box,
  plot_p_point,
  nrow = 2,
  rel_heights = c(1, 4)
)
g2p_mut_plot
# ggsave("./plots/manuscript/g2p_mut_plot.png", g2p_mut_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
# Scatter panel rebuilt for the interactive (plotly) figure: same data and
# scales as the static version but without the repelled text labels.
# Drug_Gene is mapped to `label` on the points.
plot_p_point <- ggplot(data = g2p_mut_all_summ) +
  facet_wrap(~ Dataset, scales = "free_x") +
  geom_point(
    mapping = aes(x = p, y = n, color = InG2P, label = Drug_Gene),
    alpha = 0.5
  ) +
  geom_text(
    data = g2p_mut_all_summ_text,
    mapping = aes(x = 0.5, y = 115, label = Test_Results_p)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_y_continuous(breaks = seq(0, 120, by = 10), labels = seq(0, 120, by = 10)) +
  labs(
    x = "Gene-drug association p-value between mutation groups",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )
# Violin/box panel rebuilt for the interactive (plotly) figure; outlines only,
# no fill mapping.
plot_p_box <- ggplot(data = g2p_mut_all_summ) +
  facet_wrap(~ Dataset, scales = "free_x") +
  geom_violin(
    mapping = aes(x = InG2P, y = p, color = InG2P),
    alpha = 0.5
  ) +
  geom_boxplot(
    mapping = aes(x = InG2P, y = p, color = InG2P),
    alpha = 0.75,
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  coord_flip() +
  labs(x = "Level A G2P\nassociation") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )
# Stack the two panels into one interactive figure (box panel in the top
# fifth, scatter panel below), set the axis titles, and save it as an HTML
# widget.
plot <- subplot(
  plot_p_box,
  plot_p_point,
  nrows = 2,
  titleX = TRUE,
  titleY = TRUE
) %>%
  layout(
    margin = list(l = 75),
    xaxis5 = list(title = "Gene-drug association p-value between mutation groups"),
    yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
    yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79))
  )
saveWidgetFix(widget = plot, file = "./plots/manuscript/g2p_mut_plot.html")
# Static figure for gene expression (Pearson): scatter of n against
# Pearson_GE_Abs per Dataset, with a flipped violin/box summary stacked on top.

# Scatter panel. Point shape encodes Pearson_GE_Sign; all legends are turned off.
plot_ge_pearson_point <- ggplot(
  data = g2p_cor,
  aes(x = Pearson_GE_Abs, y = n, label = Drug_Gene)
) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(fill = InG2P, color = InG2P, shape = Pearson_GE_Sign),
    alpha = 0.5,
    size = 1.25
  ) +
  # Label only rows that are in G2P and have n above 10.
  geom_text_repel(
    data = subset(g2p_cor, InG2P == "Yes" & n > 10),
    size = 3,
    segment.size = 0.2,
    segment.color = "grey50"
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.325, y = 115, label = Pearson_Test_Results_GE)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(
    breaks = seq(0, 0.65, by = 0.1),
    labels = seq(0, 0.65, by = 0.1)
  ) +
  coord_cartesian(xlim = c(0, 0.65)) +
  labs(
    x = "Pearson correlation coefficient (r) between drug-gene AUC and gene expression",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE, shape = FALSE)

# Summary panel: violin plus boxplot of Pearson_GE_Abs per InG2P group, flipped
# and limited to the same 0-0.65 range as the scatter panel's x axis.
plot_ge_pearson_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, fill = InG2P, y = Pearson_GE_Abs, x = InG2P),
    alpha = 0.35
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Pearson_GE_Abs, x = InG2P),
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_y_continuous(
    breaks = seq(0, 0.65, by = 0.1),
    labels = seq(0, 0.65, by = 0.1)
  ) +
  coord_flip(ylim = c(0, 0.65)) +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Convert both panels to gtables and give width entries 2:3 the element-wise
# maximum of the two, so the panels share those column widths.
plot_ge_pearson_point <- plot_ge_pearson_point %>%
  ggplot_build() %>%
  ggplot_gtable()
plot_ge_pearson_box <- plot_ge_pearson_box %>%
  ggplot_build() %>%
  ggplot_gtable()
p_maxWidth <- grid::unit.pmax(
  plot_ge_pearson_point$widths[2:3],
  plot_ge_pearson_box$widths[2:3]
)
plot_ge_pearson_point$widths[2:3] <- p_maxWidth
plot_ge_pearson_box$widths[2:3] <- p_maxWidth

# Stack box panel over scatter panel (1:4 height ratio) and print the result.
g2p_ge_pearson_plot <- cowplot::plot_grid(
  plot_ge_pearson_box,
  plot_ge_pearson_point,
  nrow = 2,
  rel_heights = c(1, 4)
)
g2p_ge_pearson_plot
# ggsave("./plots/manuscript/g2p_ge_pearson_plot.png", g2p_ge_pearson_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
# Interactive version of the gene expression (Pearson) figure, saved as an
# HTML widget.

# Scatter panel. `size` is both mapped (Pearson_Pval_GE) and set to the constant
# 1.25; `label` and the mapped `size` are presumably kept for the plotly hover
# text. The order of the aes() arguments is left exactly as it was.
plot_ge_point <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(
      label = Drug_Gene,
      fill = InG2P,
      color = InG2P,
      shape = Pearson_GE_Sign,
      x = Pearson_GE_Abs,
      y = n,
      size = Pearson_Pval_GE
    ),
    alpha = 0.5,
    size = 1.25
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.3, y = 115, label = Pearson_Test_Results_GE)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(breaks = pretty_breaks()) +
  # shape = NULL removes the title of the shape legend.
  labs(
    x = "Pearson correlation coefficient (r) between drug-gene AUC and gene expression",
    y = "Number of level A G2P associations",
    shape = NULL
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE)

# Summary panel: flipped violin plus boxplot of Pearson_GE_Abs per InG2P group.
plot_ge_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, y = Pearson_GE_Abs, x = InG2P),
    alpha = 0.5
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Pearson_GE_Abs, x = InG2P),
    alpha = 0.5,
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  coord_flip() +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Stack the panels (20% / 80% heights) and override axis titles and domains.
ge_pearson_plot <- subplot(
  plot_ge_box,
  plot_ge_point,
  nrows = 2,
  heights = c(0.2, 0.8),
  titleX = TRUE,
  titleY = TRUE
) %>%
  layout(
    margin = list(l = 75),
    xaxis5 = list(title = "Pearson correlation coefficient between drug-gene AUC and gene expression"),
    yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
    yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79))
  )

# saveWidgetFix() is a helper defined elsewhere in this file.
saveWidgetFix(
  widget = ge_pearson_plot,
  file = "./plots/manuscript/g2p_ge_pearson_plot.html"
)
# Static figure for gene expression (Spearman): scatter of n against
# Spearman_GE_Abs per Dataset, with a flipped violin/box summary stacked on top.

# Scatter panel. Point shape encodes Spearman_GE_Sign; all legends are turned off.
plot_ge_spearman_point <- ggplot(
  data = g2p_cor,
  aes(x = Spearman_GE_Abs, y = n, label = Drug_Gene)
) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(fill = InG2P, color = InG2P, shape = Spearman_GE_Sign),
    alpha = 0.5,
    size = 1.25
  ) +
  # Label only rows that are in G2P and have n above 10.
  geom_text_repel(
    data = subset(g2p_cor, InG2P == "Yes" & n > 10),
    size = 3,
    segment.size = 0.2,
    segment.color = "grey50"
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.3, y = 115, label = Spearman_Test_Results_GE)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(
    breaks = seq(0, 0.65, by = 0.1),
    labels = seq(0, 0.65, by = 0.1)
  ) +
  coord_cartesian(xlim = c(0, 0.6)) +
  labs(
    x = "Spearman correlation coefficient (r_s) between drug-gene AUC and gene expression",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE, shape = FALSE)

# Summary panel: violin plus boxplot of Spearman_GE_Abs per InG2P group,
# flipped and limited to the same 0-0.6 range as the scatter panel's x axis.
plot_ge_spearman_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, fill = InG2P, y = Spearman_GE_Abs, x = InG2P),
    alpha = 0.35
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Spearman_GE_Abs, x = InG2P),
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_y_continuous(
    breaks = seq(0, 0.65, by = 0.1),
    labels = seq(0, 0.65, by = 0.1)
  ) +
  coord_flip(ylim = c(0, 0.6)) +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Convert both panels to gtables and give width entries 2:3 the element-wise
# maximum of the two, so the panels share those column widths.
plot_ge_spearman_point <- plot_ge_spearman_point %>%
  ggplot_build() %>%
  ggplot_gtable()
plot_ge_spearman_box <- plot_ge_spearman_box %>%
  ggplot_build() %>%
  ggplot_gtable()
p_maxWidth <- grid::unit.pmax(
  plot_ge_spearman_point$widths[2:3],
  plot_ge_spearman_box$widths[2:3]
)
plot_ge_spearman_point$widths[2:3] <- p_maxWidth
plot_ge_spearman_box$widths[2:3] <- p_maxWidth

# Stack box panel over scatter panel (1:4 height ratio) and print the result.
g2p_ge_spearman_plot <- cowplot::plot_grid(
  plot_ge_spearman_box,
  plot_ge_spearman_point,
  nrow = 2,
  rel_heights = c(1, 4)
)
g2p_ge_spearman_plot
# ggsave("./plots/manuscript/g2p_ge_spearman_plot.png", g2p_ge_spearman_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
# Interactive version of the gene expression (Spearman) figure, saved as an
# HTML widget.

# Scatter panel. `size` is both mapped (Spearman_Pval_GE) and set to the
# constant 1.25; `label` and the mapped `size` are presumably kept for the
# plotly hover text. The order of the aes() arguments is left exactly as it was.
plot_ge_point <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(
      label = Drug_Gene,
      fill = InG2P,
      color = InG2P,
      shape = Spearman_GE_Sign,
      x = Spearman_GE_Abs,
      y = n,
      size = Spearman_Pval_GE
    ),
    alpha = 0.5,
    size = 1.25
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.3, y = 115, label = Spearman_Test_Results_GE)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(breaks = pretty_breaks()) +
  # shape = NULL removes the title of the shape legend.
  labs(
    x = "Spearman correlation coefficient (r_s) between drug-gene AUC and gene expression",
    y = "Number of level A G2P associations",
    shape = NULL
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE)

# Summary panel: flipped violin plus boxplot of Spearman_GE_Abs per InG2P group.
plot_ge_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, y = Spearman_GE_Abs, x = InG2P),
    alpha = 0.5
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Spearman_GE_Abs, x = InG2P),
    alpha = 0.5,
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  coord_flip() +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Stack the panels (20% / 80% heights) and override axis titles and domains.
ge_spearman_plot <- subplot(
  plot_ge_box,
  plot_ge_point,
  nrows = 2,
  heights = c(0.2, 0.8),
  titleX = TRUE,
  titleY = TRUE
) %>%
  layout(
    margin = list(l = 75),
    xaxis5 = list(title = "Spearman correlation coefficient between drug-gene AUC and gene expression"),
    yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
    yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79))
  )

# saveWidgetFix() is a helper defined elsewhere in this file.
saveWidgetFix(
  widget = ge_spearman_plot,
  file = "./plots/manuscript/g2p_ge_spearman_plot.html"
)
# Static figure for copy number (Pearson): scatter of n against Pearson_CN_Abs
# per Dataset, with a flipped violin/box summary stacked on top.

# Scatter panel. Point shape encodes Pearson_CN_Sign; all legends are turned off.
plot_cn_pearson_point <- ggplot(
  data = g2p_cor,
  aes(x = Pearson_CN_Abs, y = n, label = Drug_Gene)
) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(fill = InG2P, color = InG2P, shape = Pearson_CN_Sign),
    alpha = 0.5,
    size = 1.25
  ) +
  # Label only rows that are in G2P and have n above 10.
  geom_text_repel(
    data = subset(g2p_cor, InG2P == "Yes" & n > 10),
    size = 3,
    segment.size = 0.2,
    segment.color = "grey50"
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.25, y = 115, label = Pearson_Test_Results_CN)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(
    breaks = seq(0, 0.65, by = 0.1),
    labels = seq(0, 0.65, by = 0.1)
  ) +
  coord_cartesian(xlim = c(0, 0.5)) +
  labs(
    x = "Pearson correlation coefficient (r) between drug-gene AUC and copy number",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE, shape = FALSE)

# Summary panel: violin plus boxplot of Pearson_CN_Abs per InG2P group, flipped
# and limited to the same 0-0.5 range as the scatter panel's x axis.
plot_cn_pearson_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, fill = InG2P, y = Pearson_CN_Abs, x = InG2P),
    alpha = 0.35
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Pearson_CN_Abs, x = InG2P),
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_y_continuous(
    breaks = seq(0, 0.65, by = 0.1),
    labels = seq(0, 0.65, by = 0.1)
  ) +
  coord_flip(ylim = c(0, 0.5)) +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Convert both panels to gtables and give width entries 2:3 the element-wise
# maximum of the two, so the panels share those column widths.
plot_cn_pearson_point <- plot_cn_pearson_point %>%
  ggplot_build() %>%
  ggplot_gtable()
plot_cn_pearson_box <- plot_cn_pearson_box %>%
  ggplot_build() %>%
  ggplot_gtable()
p_maxWidth <- grid::unit.pmax(
  plot_cn_pearson_point$widths[2:3],
  plot_cn_pearson_box$widths[2:3]
)
plot_cn_pearson_point$widths[2:3] <- p_maxWidth
plot_cn_pearson_box$widths[2:3] <- p_maxWidth

# Stack box panel over scatter panel (1:4 height ratio) and print the result.
g2p_cn_pearson_plot <- cowplot::plot_grid(
  plot_cn_pearson_box,
  plot_cn_pearson_point,
  nrow = 2,
  rel_heights = c(1, 4)
)
g2p_cn_pearson_plot
# ggsave("./plots/manuscript/g2p_cn_pearson_plot.png", g2p_cn_pearson_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
# Interactive version of the copy number (Pearson) figure, saved as an HTML
# widget.

# Scatter panel. `size` is both mapped (Pearson_Pval_CN) and set to the constant
# 1.25; `label` and the mapped `size` are presumably kept for the plotly hover
# text. The order of the aes() arguments is left exactly as it was.
plot_cn_point <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(
      label = Drug_Gene,
      fill = InG2P,
      color = InG2P,
      shape = Pearson_CN_Sign,
      x = Pearson_CN_Abs,
      y = n,
      size = Pearson_Pval_CN
    ),
    alpha = 0.5,
    size = 1.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(breaks = pretty_breaks()) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.3, y = 115, label = Pearson_Test_Results_CN)
  ) +
  labs(
    x = "Pearson correlation coefficient (r) between drug-gene AUC and copy number",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE) +
  # shape = NULL removes the title of the shape legend.
  labs(shape = NULL)

# Summary panel: flipped violin plus boxplot of Pearson_CN_Abs per InG2P group.
plot_cn_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, y = Pearson_CN_Abs, x = InG2P),
    alpha = 0.5
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Pearson_CN_Abs, x = InG2P),
    alpha = 0.5,
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  coord_flip() +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Stack the panels (20% / 80% heights) and override axis titles and domains.
cn_pearson_plot <- subplot(
  plot_cn_box,
  plot_cn_point,
  nrows = 2,
  heights = c(0.2, 0.8),
  titleX = TRUE,
  titleY = TRUE
) %>%
  layout(
    margin = list(l = 75),
    xaxis5 = list(title = "Pearson correlation coefficient between drug-gene AUC and copy number"),
    yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
    yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79))
  )

# Save the copy-number widget built above. This previously passed
# ge_pearson_plot, which wrote the gene-expression figure into the copy-number
# HTML file. saveWidgetFix() is a helper defined elsewhere in this file.
saveWidgetFix(
  widget = cn_pearson_plot,
  file = "./plots/manuscript/g2p_cn_pearson_plot.html"
)
# Static figure for copy number (Spearman): scatter of n against
# Spearman_CN_Abs per Dataset, with a flipped violin/box summary stacked on top.

# Scatter panel. Point shape encodes Spearman_CN_Sign; all legends are turned off.
plot_cn_spearman_point <- ggplot(
  data = g2p_cor,
  aes(x = Spearman_CN_Abs, y = n, label = Drug_Gene)
) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(fill = InG2P, color = InG2P, shape = Spearman_CN_Sign),
    alpha = 0.5,
    size = 1.25
  ) +
  # Label only rows that are in G2P and have n above 10.
  geom_text_repel(
    data = subset(g2p_cor, InG2P == "Yes" & n > 10),
    size = 3,
    segment.size = 0.2,
    segment.color = "grey50"
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.175, y = 115, label = Spearman_Test_Results_CN)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(
    breaks = seq(0, 0.4, by = 0.05),
    labels = seq(0, 0.4, by = 0.05)
  ) +
  coord_cartesian(xlim = c(0, 0.35)) +
  labs(
    x = "Spearman correlation coefficient (r_s) between drug-gene AUC and copy number",
    y = "Number of level A G2P associations"
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE, shape = FALSE)

# Summary panel: violin plus boxplot of Spearman_CN_Abs per InG2P group,
# flipped and limited to the same 0-0.35 range as the scatter panel's x axis.
plot_cn_spearman_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, fill = InG2P, y = Spearman_CN_Abs, x = InG2P),
    alpha = 0.35
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Spearman_CN_Abs, x = InG2P),
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_y_continuous(
    breaks = seq(0, 0.4, by = 0.05),
    labels = seq(0, 0.4, by = 0.05)
  ) +
  coord_flip(ylim = c(0, 0.35)) +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Convert both panels to gtables and give width entries 2:3 the element-wise
# maximum of the two, so the panels share those column widths.
plot_cn_spearman_point <- plot_cn_spearman_point %>%
  ggplot_build() %>%
  ggplot_gtable()
plot_cn_spearman_box <- plot_cn_spearman_box %>%
  ggplot_build() %>%
  ggplot_gtable()
p_maxWidth <- grid::unit.pmax(
  plot_cn_spearman_point$widths[2:3],
  plot_cn_spearman_box$widths[2:3]
)
plot_cn_spearman_point$widths[2:3] <- p_maxWidth
plot_cn_spearman_box$widths[2:3] <- p_maxWidth

# Stack box panel over scatter panel (1:4 height ratio) and print the result.
g2p_cn_spearman_plot <- cowplot::plot_grid(
  plot_cn_spearman_box,
  plot_cn_spearman_point,
  nrow = 2,
  rel_heights = c(1, 4)
)
g2p_cn_spearman_plot
# ggsave("./plots/manuscript/g2p_cn_spearman_plot.png", g2p_cn_spearman_plot, device = "png", dpi = 450, width = 12, height = 5, units = "in")
# Interactive version of the copy number (Spearman) figure, saved as an HTML
# widget.

# Scatter panel. `size` is both mapped (Spearman_Pval_CN) and set to the
# constant 1.25; `label` and the mapped `size` are presumably kept for the
# plotly hover text. The order of the aes() arguments is left exactly as it was.
plot_cn_point <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_point(
    mapping = aes(
      label = Drug_Gene,
      fill = InG2P,
      color = InG2P,
      shape = Spearman_CN_Sign,
      x = Spearman_CN_Abs,
      y = n,
      size = Spearman_Pval_CN
    ),
    alpha = 0.5,
    size = 1.25
  ) +
  # Per-facet annotation text taken from g2p_cor_text.
  geom_text(
    data = g2p_cor_text,
    mapping = aes(x = 0.175, y = 115, label = Spearman_Test_Results_CN)
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_fill_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  scale_shape_manual(values = c(25, 24), labels = c("Negative", "Positive")) +
  scale_y_continuous(
    breaks = seq(0, 120, by = 10),
    labels = seq(0, 120, by = 10)
  ) +
  scale_x_continuous(breaks = pretty_breaks()) +
  # shape = NULL removes the title of the shape legend.
  labs(
    x = "Spearman correlation coefficient (r_s) between drug-gene AUC and copy number",
    y = "Number of level A G2P associations",
    shape = NULL
  ) +
  theme(
    legend.position = "bottom",
    plot.margin = unit(c(0.05, 0.5, 0.5, 0.5), "cm"),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) +
  guides(color = FALSE, fill = FALSE)

# Summary panel: flipped violin plus boxplot of Spearman_CN_Abs per InG2P group.
plot_cn_box <- ggplot(data = g2p_cor) +
  facet_wrap(~ Dataset) +
  geom_violin(
    mapping = aes(color = InG2P, y = Spearman_CN_Abs, x = InG2P),
    alpha = 0.5
  ) +
  geom_boxplot(
    mapping = aes(color = InG2P, y = Spearman_CN_Abs, x = InG2P),
    alpha = 0.5,
    width = 0.25
  ) +
  scale_color_manual(values = c("Yes" = "darkorchid4", "No" = "lightpink")) +
  coord_flip() +
  labs(x = "In G2P?") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(0.5, 0.5, 0.05, 0.5), "cm")
  )

# Stack the panels (20% / 80% heights) and override axis titles and domains.
cn_spearman_plot <- subplot(
  plot_cn_box,
  plot_cn_point,
  nrows = 2,
  heights = c(0.2, 0.8),
  titleX = TRUE,
  titleY = TRUE
) %>%
  layout(
    margin = list(l = 75),
    xaxis5 = list(title = "Spearman correlation coefficient between drug-gene AUC and copy number"),
    yaxis = list(title = "In G2P?", domain = c(0.8, 1)),
    yaxis2 = list(title = "Number of level A G2P associations", domain = c(0, 0.79))
  )

# saveWidgetFix() is a helper defined elsewhere in this file.
saveWidgetFix(
  widget = cn_spearman_plot,
  file = "./plots/manuscript/g2p_cn_spearman_plot.html"
)
Make summary plots of mutation status tests by lineage and return a dataframe of all t-test results from comparing the p-value distributions of G2P and non-G2P drug-gene associations:
# Call MutPlotsByLineage() (defined elsewhere in this file) once per distinct
# lineage name and collect the per-lineage results in a list.
dgl_mut_g2p_grid_ttest_res <- dgl_mut_grid_summ$group_general_lineage_name %>%
  unique() %>%
  as.character() %>%
  lapply(MutPlotsByLineage)

# Bind the per-lineage results into one table (columns matched by name) and
# sort it by ascending p_value.
dgl_mut_g2p_grid_ttest_res_list <- rbindlist(
  dgl_mut_g2p_grid_ttest_res,
  use.names = TRUE
)
dgl_mut_g2p_grid_ttest_res_list <- dgl_mut_g2p_grid_ttest_res_list[
  order(dgl_mut_g2p_grid_ttest_res_list$p_value),
]

# Write the table to disk, then read it straight back from the same file.
saveRDS(
  dgl_mut_g2p_grid_ttest_res_list,
  "./data_munging/rds/dgl_mut_g2p_grid_ttest_res_list.rds"
)
dgl_mut_g2p_grid_ttest_res_list <- readRDS(
  "./data_munging/rds/dgl_mut_g2p_grid_ttest_res_list.rds"
)

# Styled, scrollable HTML table of the sorted results.
dgl_mut_g2p_grid_ttest_res_list_kable <- knitr::kable(
  dgl_mut_g2p_grid_ttest_res_list,
  caption = "Mutation status: T-test results for level A G2P drug-gene associations across lineages"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "900px", height = "450px")
Make summary plots of gene expression and copy number tests by lineage and return a dataframe of all t-test results from comparing the p-value distributions of G2P and non-G2P drug-gene associations:
# Call PearsonPlotsByLineage() (defined elsewhere in this file) once per
# distinct lineage name and bind the per-lineage results into one table
# (columns matched by name).
dgl_pearson_g2p_grid_ttest_res <- lapply(
  as.character(unique(dgl_signif_pearson_g2p_grid$group_general_lineage_name)),
  PearsonPlotsByLineage
)
dgl_pearson_g2p_grid_ttest_res_list <- rbindlist(
  dgl_pearson_g2p_grid_ttest_res,
  use.names = TRUE
)

# Write the table to disk, then read it straight back from the same file.
saveRDS(
  dgl_pearson_g2p_grid_ttest_res_list,
  "./data_munging/rds/dgl_pearson_g2p_grid_ttest_res_list.rds"
)
dgl_pearson_g2p_grid_ttest_res_list <- readRDS(
  "./data_munging/rds/dgl_pearson_g2p_grid_ttest_res_list.rds"
)

# Gene expression table: the *_GE columns, sorted by ascending p_value_GE.
dgl_ge_g2p_grid_ttest_res_list_kable <- knitr::kable(
  dgl_pearson_g2p_grid_ttest_res_list[
    order(dgl_pearson_g2p_grid_ttest_res_list$p_value_GE),
    c("Dataset", "group_general_lineage_name", "t_statistic_GE", "df_GE", "p_value_GE")
  ],
  caption = "Gene expression: T-test results for level A G2P drug-gene/gene expression associations across lineages"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "900px", height = "450px")

# Copy number table: the *_CN columns, sorted by ascending p_value_CN.
# The caption previously repeated the gene-expression wording
# ("drug-gene/gene expression associations"); it now names copy number.
dgl_cn_g2p_grid_ttest_res_list_kable <- knitr::kable(
  dgl_pearson_g2p_grid_ttest_res_list[
    order(dgl_pearson_g2p_grid_ttest_res_list$p_value_CN),
    c("Dataset", "group_general_lineage_name", "t_statistic_CN", "df_CN", "p_value_CN")
  ],
  caption = "Copy number: T-test results for level A G2P drug-gene/copy number associations across lineages"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "900px", height = "450px")
Figure #. Histograms of p-values from t-tests comparing distributions of summary statistics for CCLE, CTRP, and GDSC drug screen results in the context of G2P level A evidence and lineage. Each p-value is derived from comparisons similar to those shown in Figure 2, but for these p-values, the data is grouped by lineage prior to conducting the t-test rather than the test being lineage-agnostic. To produce these p-values, AUC was correlated with each genomic feature for all possible pairwise combinations of G2P drugs and genes in all lineages represented in each dataset. For each lineage, t-tests were used to compare distributions of p-values (mutation status) or Pearson correlation coefficients (gene expression and copy number) that were grouped by whether or not the underlying drug-gene combination had level-A G2P evidence. For each dataset, if there were two or more correlation results in each G2P presence/absence group in a specific lineage, a t-test was conducted and the p-value appears in the appropriate histogram. Mutation status histograms are sparse due to limited mutation annotations for many of the G2P genes in all screened cell lines.
# Build one long table of per-lineage t-test p-values (mutation status, gene
# expression, copy number) and plot a histogram grid: Metric rows x Dataset
# columns.

# Mutation status p-values.
ttest_summ_res_mut <- dgl_mut_g2p_grid_ttest_res_list[, c("p_value", "Dataset", "group_general_lineage_name")]
ttest_summ_res_mut$Metric <- "Mutation Status"

# Gene expression p-values; p_value_GE is renamed to p_value so the three
# tables can be row-bound.
ttest_summ_res_ge <- dgl_pearson_g2p_grid_ttest_res_list[, c("p_value_GE", "Dataset", "group_general_lineage_name")]
colnames(ttest_summ_res_ge) <- c("p_value", "Dataset", "group_general_lineage_name")
ttest_summ_res_ge$Metric <- "Gene Expression"

# Copy number p-values; p_value_CN is renamed the same way.
ttest_summ_res_cn <- dgl_pearson_g2p_grid_ttest_res_list[, c("p_value_CN", "Dataset", "group_general_lineage_name")]
colnames(ttest_summ_res_cn) <- c("p_value", "Dataset", "group_general_lineage_name")
ttest_summ_res_cn$Metric <- "Copy Number"

# Stack the three tables and fix the facet row order.
ttest_summ_res <- rbind(ttest_summ_res_mut, ttest_summ_res_ge, ttest_summ_res_cn)
ttest_summ_res$Metric <- factor(
  ttest_summ_res$Metric,
  levels = c("Mutation Status", "Gene Expression", "Copy Number")
)

ttest_mut_summ_facet <- ggplot(
  data = ttest_summ_res,
  aes(x = p_value, fill = Dataset, color = Dataset)
) +
  facet_grid(Metric ~ Dataset) +
  # Bins are 0.025 wide, anchored at 0 and left-closed. The former `bins = 40`
  # argument was dropped: geom_histogram() ignores `bins` when `binwidth` is
  # supplied, so the binning is unchanged.
  geom_histogram(binwidth = 0.025, boundary = 0, closed = "left") +
  scale_y_continuous(breaks = c(0:6), labels = c(0:6)) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1), expand = c(0.01, 0.01)) +
  scale_fill_manual(values = c("CCLE" = "palegreen3", "CTRP" = "slategray3", "GDSC" = "paleturquoise3")) +
  scale_color_manual(values = c("CCLE" = "springgreen4", "CTRP" = "steelblue4", "GDSC" = "turquoise4")) +
  theme(
    legend.position = c(0.5, 1.7),
    legend.direction = "horizontal",
    panel.grid.minor.y = element_blank()
  ) +
  # y-axis label previously misspelled as "Frquency".
  labs(x = "P-value", y = "Frequency")

# Print the figure.
ttest_mut_summ_facet
# ggsave("./plots/manuscript/all_dgl_ttest_summ_facet_plot.png", ttest_mut_summ_facet, device = "png", dpi = 450, width = 12, height = 4, units = "in")
Mutation status boxplots for genes identified in G2P.
Boxplots ordered by gene based on frequency of G2P associations.
# CRISPR boxplots of Score by mutation status, one facet per gene, with facets
# ordered by the level order given in g2p_genes.
crispr_data_g2p$Hugo_Symbol <- factor(
  crispr_data_g2p$Hugo_Symbol,
  levels = g2p_genes
)

# Named colour vector for the fill scale: values come from Color_Nonsilent,
# names from Mutation_Status_Nonsilent (one entry per data row).
crispr_data_g2p_color <- as.character(crispr_data_g2p$Color_Nonsilent)
names(crispr_data_g2p_color) <- crispr_data_g2p$Mutation_Status_Nonsilent

# Per-gene significance labels, restricted to genes in g2p_genes and given the
# same factor levels as the plotted data so they land in the matching facet.
crispr_label_text <- data.frame(
  p.signif.adj = crispr_signif_g2p_gene$p.signif.adj,
  p.signif = crispr_signif_g2p_gene$p.signif,
  Hugo_Symbol = crispr_signif_g2p_gene$Hugo_Symbol
)
crispr_label_text <- filter(crispr_label_text, Hugo_Symbol %in% g2p_genes)
crispr_label_text$Hugo_Symbol <- factor(
  crispr_label_text$Hugo_Symbol,
  levels = g2p_genes
)

crispr_data_g2p_plot <- ggplot(
  data = crispr_data_g2p,
  aes(x = Mutation_Status_Nonsilent, y = Score)
) +
  # drop = FALSE keeps a facet for every factor level, even without data.
  facet_wrap(~ Hugo_Symbol, drop = FALSE, nrow = 1) +
  # Dashed reference line at a score of 0.
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_boxplot(
    mapping = aes(fill = Mutation_Status_Nonsilent),
    position = position_dodge(0.85)
  ) +
  # Significance label placed above the boxes in each facet.
  geom_text(
    data = crispr_label_text,
    mapping = aes(x = 1.5, y = 2, label = p.signif),
    nudge_y = 0.1
  ) +
  scale_fill_manual(values = crispr_data_g2p_color) +
  guides(color = FALSE) +
  labs(
    title = "CRISPR",
    fill = "Mutation Status",
    y = "CERES Score",
    x = "Mutation Status"
  ) +
  theme(
    strip.text.x = element_text(size = 10, angle = 45),
    legend.position = "top",
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank()
  )

# Print the figure.
crispr_data_g2p_plot
# ggsave("./plots/manuscript/crispr_data_g2p_plot.png", crispr_data_g2p_plot, device = "png", dpi = 450, width = 20, height = 5, units = "in")
Plot code:
# Build one boxplot figure per distinct CCLE drug with makeDrugBoxplots()
# (defined elsewhere in this file) and save each as a PNG named after the drug.
ccle_drugs <- ccle_data_ptmuts$Drug %>%
  unique() %>%
  as.character()
ccle_g2p_drugboxplots <- lapply(
  ccle_drugs,
  makeDrugBoxplots,
  dataset = "CCLE"
)
ccle_g2p_drugboxplots_paths <- paste0(ccle_drugs, "_ccle_drugboxplots.png")

# pwalk() pairs each file name with its plot and passes them to ggsave() as the
# first two positional arguments; the remaining arguments are shared.
pwalk(
  list(ccle_g2p_drugboxplots_paths, ccle_g2p_drugboxplots),
  ggsave,
  path = "./plots/manuscript/ccle_g2p_drugboxplots",
  dpi = 450,
  width = 20,
  height = 5,
  units = "in"
)
Data management:
# Collect the saved CCLE boxplot images (full paths) and name each entry by
# stripping the directory prefix and file-name suffix from its path.
ccle_g2p_drugboxplots <- list.files(
  "./plots/manuscript/ccle_g2p_drugboxplots",
  full.names = TRUE
)
names(ccle_g2p_drugboxplots) <- str_replace_all(
  ccle_g2p_drugboxplots,
  c(
    "_ccle_drugboxplots.png" = "",
    "./plots/manuscript/ccle_g2p_drugboxplots/" = ""
  )
)

# Searchable drop-down image selector (bsselectR) over the named paths.
bsselect(
  ccle_g2p_drugboxplots,
  type = "img",
  live_search = TRUE,
  show_tick = TRUE,
  height = 100
)
Plot code:
# Build one boxplot figure per distinct CTRP drug with makeDrugBoxplots()
# (defined elsewhere in this file) and save each as a PNG named after the drug.
# NOTE(review): the dataset label is lower-case here ("ctrp") while the CCLE
# call passes "CCLE" -- confirm makeDrugBoxplots() handles both casings.
ctrp_drugs <- ctrp_data_ptmuts$Drug %>%
  unique() %>%
  as.character()
ctrp_g2p_drugboxplots <- lapply(
  ctrp_drugs,
  makeDrugBoxplots,
  dataset = "ctrp"
)
ctrp_g2p_drugboxplots_paths <- paste0(ctrp_drugs, "_ctrp_drugboxplots.png")

# pwalk() pairs each file name with its plot and passes them to ggsave() as the
# first two positional arguments; the remaining arguments are shared.
pwalk(
  list(ctrp_g2p_drugboxplots_paths, ctrp_g2p_drugboxplots),
  ggsave,
  path = "./plots/manuscript/ctrp_g2p_drugboxplots",
  dpi = 450,
  width = 20,
  height = 5,
  units = "in"
)
Data management:
# Collect the saved CTRP boxplot images (full paths) and name each entry by
# stripping the directory prefix and file-name suffix from its path.
ctrp_g2p_drugboxplots <- list.files(
  "./plots/manuscript/ctrp_g2p_drugboxplots",
  full.names = TRUE
)
names(ctrp_g2p_drugboxplots) <- str_replace_all(
  ctrp_g2p_drugboxplots,
  c(
    "_ctrp_drugboxplots.png" = "",
    "./plots/manuscript/ctrp_g2p_drugboxplots/" = ""
  )
)

# Searchable drop-down image selector (bsselectR) over the named paths.
bsselect(
  ctrp_g2p_drugboxplots,
  type = "img",
  live_search = TRUE,
  show_tick = TRUE,
  height = 100
)
Plot code:
# Build one boxplot figure per distinct GDSC drug with makeDrugBoxplots()
# (defined elsewhere in this file) and save each as a PNG named after the drug.
# NOTE(review): the dataset label is lower-case here ("gdsc") while the CCLE
# call passes "CCLE" -- confirm makeDrugBoxplots() handles both casings.
gdsc_drugs <- gdsc_data_ptmuts$Drug %>%
  unique() %>%
  as.character()
gdsc_g2p_drugboxplots <- lapply(
  gdsc_drugs,
  makeDrugBoxplots,
  dataset = "gdsc"
)
gdsc_g2p_drugboxplots_paths <- paste0(gdsc_drugs, "_gdsc_drugboxplots.png")

# pwalk() pairs each file name with its plot and passes them to ggsave() as the
# first two positional arguments; the remaining arguments are shared.
pwalk(
  list(gdsc_g2p_drugboxplots_paths, gdsc_g2p_drugboxplots),
  ggsave,
  path = "./plots/manuscript/gdsc_g2p_drugboxplots",
  dpi = 450,
  width = 20,
  height = 5,
  units = "in"
)
Data management:
# Collect the saved GDSC boxplot images (full paths) and name each entry by
# stripping the directory prefix and file-name suffix from its path.
gdsc_g2p_drugboxplots <- list.files(
  "./plots/manuscript/gdsc_g2p_drugboxplots",
  full.names = TRUE
)
names(gdsc_g2p_drugboxplots) <- str_replace_all(
  gdsc_g2p_drugboxplots,
  c(
    "_gdsc_drugboxplots.png" = "",
    "./plots/manuscript/gdsc_g2p_drugboxplots/" = ""
  )
)

# Searchable drop-down image selector (bsselectR) over the named paths.
bsselect(
  gdsc_g2p_drugboxplots,
  type = "img",
  live_search = TRUE,
  show_tick = TRUE,
  height = 100
)
# CCLE boxplots of AUC by mutation status, one facet per drug-gene pair, with
# facets in the row order of ccle_signif_g2p.
ccle_g2p_order <- as.character(ccle_signif_g2p$Drug_Gene)

# Ordered facet factor: levels follow ccle_g2p_order, and each label is the
# first two "_"-separated pieces of the pair joined by a newline. The same
# factor is added to both the data and the significance table.
ccle_data_g2p$Drug_Gene_Ordered <- factor(
  ccle_data_g2p$Drug_Gene,
  levels = ccle_g2p_order,
  labels = sapply(
    strsplit(x = ccle_g2p_order, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2])
  )
)
ccle_signif_g2p$Drug_Gene_Ordered <- factor(
  ccle_signif_g2p$Drug_Gene,
  levels = ccle_g2p_order,
  labels = sapply(
    strsplit(x = ccle_g2p_order, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2])
  )
)

# Named colour vector for the fill scale: values come from Color_Nonsilent,
# names from Mutation_Status_Nonsilent (one entry per data row).
ccle_data_g2p_color <- as.character(ccle_data_g2p$Color_Nonsilent)
names(ccle_data_g2p_color) <- ccle_data_g2p$Mutation_Status_Nonsilent

# Per-facet significance labels.
ccle_label_text <- data.frame(
  p.signif.adj = ccle_signif_g2p$p.signif.adj,
  p.signif = ccle_signif_g2p$p.signif,
  Drug_Gene_Ordered = ccle_signif_g2p$Drug_Gene_Ordered
)

ccle_data_g2p_plot <- ggplot(
  data = ccle_data_g2p,
  aes(x = Mutation_Status_Nonsilent, y = AUC)
) +
  # drop = FALSE keeps a facet for every factor level, even without data.
  facet_wrap(~ Drug_Gene_Ordered, drop = FALSE, nrow = 1) +
  # Dashed reference line at AUC = 0.
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_boxplot(
    mapping = aes(fill = Mutation_Status_Nonsilent),
    position = position_dodge(0.85),
    outlier.shape = 3,
    outlier.size = 0.5
  ) +
  # Significance label placed at y = 7 (plus a small nudge) in each facet.
  geom_text(
    data = ccle_label_text,
    mapping = aes(x = 1.5, y = 7, label = p.signif),
    nudge_y = 0.1
  ) +
  scale_fill_manual(values = ccle_data_g2p_color) +
  guides(color = FALSE) +
  labs(
    title = "CCLE",
    fill = "Mutation Status",
    y = "AUC",
    x = "Mutation Status"
  ) +
  theme(
    legend.position = "top",
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank()
  )

# Print the figure.
ccle_data_g2p_plot
# ggsave("./plots/manuscript/ccle_data_g2p_plot.png", ccle_data_g2p_plot, device = "png", dpi = 450, width = 12, height = 4, units = "in")
# CTRP boxplots of AUC by mutation status, one facet per drug-gene pair, with
# facets in the row order of ctrp_signif_g2p, laid out over three rows.
ctrp_g2p_order <- as.character(ctrp_signif_g2p$Drug_Gene)

# Ordered facet factor: levels follow ctrp_g2p_order, and each label is the
# first two "_"-separated pieces of the pair joined by a newline. The same
# factor is added to both the data and the significance table.
ctrp_data_g2p$Drug_Gene_Ordered <- factor(
  ctrp_data_g2p$Drug_Gene,
  levels = ctrp_g2p_order,
  labels = sapply(
    strsplit(x = ctrp_g2p_order, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2])
  )
)
ctrp_signif_g2p$Drug_Gene_Ordered <- factor(
  ctrp_signif_g2p$Drug_Gene,
  levels = ctrp_g2p_order,
  labels = sapply(
    strsplit(x = ctrp_g2p_order, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2])
  )
)

# Named colour vector for the fill scale: values come from Color_Nonsilent,
# names from Mutation_Status_Nonsilent (one entry per data row).
ctrp_data_g2p_color <- as.character(ctrp_data_g2p$Color_Nonsilent)
names(ctrp_data_g2p_color) <- ctrp_data_g2p$Mutation_Status_Nonsilent

# Per-facet significance labels.
ctrp_label_text <- data.frame(
  p.signif.adj = ctrp_signif_g2p$p.signif.adj,
  p.signif = ctrp_signif_g2p$p.signif,
  Drug_Gene_Ordered = ctrp_signif_g2p$Drug_Gene_Ordered
)

ctrp_data_g2p_plot <- ggplot(
  data = ctrp_data_g2p,
  aes(x = Mutation_Status_Nonsilent, y = AUC)
) +
  # drop = FALSE keeps a facet for every factor level, even without data.
  facet_wrap(~ Drug_Gene_Ordered, drop = FALSE, nrow = 3) +
  # Dashed reference line at AUC = 0.
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_boxplot(
    mapping = aes(fill = Mutation_Status_Nonsilent),
    position = position_dodge(0.85),
    outlier.shape = 3,
    outlier.size = 0.5
  ) +
  # Significance label placed at y = 22.5 (plus a small nudge) in each facet.
  geom_text(
    data = ctrp_label_text,
    mapping = aes(x = 1.5, y = 22.5, label = p.signif),
    nudge_y = 0.1
  ) +
  scale_fill_manual(values = ctrp_data_g2p_color) +
  guides(color = FALSE) +
  labs(
    title = "CTRP",
    fill = "Mutation Status",
    y = "AUC",
    x = "Mutation Status"
  ) +
  theme(
    legend.position = "top",
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank()
  )

# Print the figure.
ctrp_data_g2p_plot
# ggsave("./plots/manuscript/ctrp_data_g2p_plot.png", ctrp_data_g2p_plot, device = "png", dpi = 450, width = 12, height = 7, units = "in")
# GDSC boxplots of AUC by mutation status, one facet per drug-gene pair, with
# facets in the row order of gdsc_signif_g2p, laid out over three rows.
gdsc_g2p_order <- as.character(gdsc_signif_g2p$Drug_Gene)

# Ordered facet factor: levels follow gdsc_g2p_order, and each label is the
# first two "_"-separated pieces of the pair joined by a newline. The same
# factor is added to both the data and the significance table.
gdsc_data_g2p$Drug_Gene_Ordered <- factor(
  gdsc_data_g2p$Drug_Gene,
  levels = gdsc_g2p_order,
  labels = sapply(
    strsplit(x = gdsc_g2p_order, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2])
  )
)
gdsc_signif_g2p$Drug_Gene_Ordered <- factor(
  gdsc_signif_g2p$Drug_Gene,
  levels = gdsc_g2p_order,
  labels = sapply(
    strsplit(x = gdsc_g2p_order, split = "_"),
    function(parts) paste0(parts[1], "\n", parts[2])
  )
)

# Named colour vector for the fill scale: values come from Color_Nonsilent,
# names from Mutation_Status_Nonsilent (one entry per data row).
gdsc_data_g2p_color <- as.character(gdsc_data_g2p$Color_Nonsilent)
names(gdsc_data_g2p_color) <- gdsc_data_g2p$Mutation_Status_Nonsilent

# Per-facet significance labels.
gdsc_label_text <- data.frame(
  p.signif.adj = gdsc_signif_g2p$p.signif.adj,
  p.signif = gdsc_signif_g2p$p.signif,
  Drug_Gene_Ordered = gdsc_signif_g2p$Drug_Gene_Ordered
)

gdsc_data_g2p_plot <- ggplot(
  data = gdsc_data_g2p,
  aes(x = Mutation_Status_Nonsilent, y = AUC)
) +
  # drop = FALSE keeps a facet for every factor level, even without data.
  facet_wrap(~ Drug_Gene_Ordered, drop = FALSE, nrow = 3) +
  # Dashed reference line at AUC = 0.
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_boxplot(
    mapping = aes(fill = Mutation_Status_Nonsilent),
    position = position_dodge(0.85),
    outlier.shape = 3,
    outlier.size = 0.5
  ) +
  # Significance label placed at y = 1 (plus a small nudge) in each facet.
  geom_text(
    data = gdsc_label_text,
    mapping = aes(x = 1.5, y = 1, label = p.signif),
    nudge_y = 0.1
  ) +
  scale_fill_manual(values = gdsc_data_g2p_color) +
  guides(color = FALSE) +
  labs(
    title = "GDSC",
    fill = "Mutation Status",
    y = "AUC",
    x = "Mutation Status"
  ) +
  theme(
    legend.position = "top",
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank()
  )

# Print the figure.
gdsc_data_g2p_plot
# ggsave("./plots/manuscript/gdsc_data_g2p_plot.png", gdsc_data_g2p_plot, device = "png", dpi = 450, width = 12, height = 7, units = "in")
Match specific point mutations.
# Load the mutation-association table (tab-separated despite the .csv extension)
# and keep only the rows with Evidence.Level "A".
g2p_indications <- read.delim("./data_munging/data_mutation_associations_appended.csv", sep = "\t", header = TRUE) %>%
  filter(Evidence.Level == "A")

# Restrict the raw MAF to the genome changes named in those associations.
maf_g2p_indications <- filter(maf_raw, Genome_Change %in% g2p_indications$MutationName)

# Left-join the matched mutations onto the CRISPR scores; rows without a match keep NA mutation fields.
crispr_g2p_indications <- merge(crispr_data_ptmuts, maf_g2p_indications, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)

# Treat a missing mutation status as wildtype. as.character() guards against the column
# being a factor, in which case ifelse() would return integer codes instead of labels.
crispr_g2p_indications$Mutation_Status_Nonsilent <- ifelse(
  is.na(crispr_g2p_indications$Mutation_Status_Nonsilent),
  "Wildtype",
  as.character(crispr_g2p_indications$Mutation_Status_Nonsilent)
)

# Count rows per gene / cell line key and keep the keys that occur more than once.
# Only the key columns are selected before tallying, and the result is ungrouped so
# no grouping leaks into later steps.
dup_g2p_indications <- crispr_g2p_indications[, c("Hugo_Symbol", "CCLE_Name", "DepMap_ID")] %>%
  group_by(Hugo_Symbol, CCLE_Name, DepMap_ID) %>%
  tally() %>%
  ungroup() %>%
  filter(n > 1)

# Attach the duplicate count `n`; it stays NA for keys that occur exactly once.
crispr_g2p_indications <- merge(crispr_g2p_indications, dup_g2p_indications, by = c("Hugo_Symbol", "CCLE_Name", "DepMap_ID"), all.x = TRUE)
crispr_g2p_indications$Genome_Change <- factor(crispr_g2p_indications$Genome_Change, levels = unique(crispr_g2p_indications$Genome_Change))

# Rows whose key is not duplicated (n is NA). Not used by the plots below.
crispr_g2p_indications_nonNA <- filter(crispr_g2p_indications, is.na(n))

# CERES score per gene, points coloured by mutation status.
crispr_g2p_indications_plot <- ggplot(crispr_g2p_indications, aes(x = Hugo_Symbol, y = Score)) +
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_boxplot(alpha = 0.5, color = "lightgray", outlier.shape = NA) +
  geom_jitter(width = 0.3, mapping = aes(color = Mutation_Status_Nonsilent), size = 1) +
  theme_light() +
  theme(legend.title = element_blank()) +
  labs(x = "Gene", y = "CERES Score")
crispr_g2p_indications_plot

# CERES score per protein change, faceted by gene.
crispr_g2p_indications_muts_plot <- ggplot(crispr_g2p_indications, aes(x = Protein_Change, y = Score)) +
  facet_grid(~ Hugo_Symbol, scales = "free_x", space = "free") +
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_boxplot(alpha = 0.5, color = "lightgray", outlier.shape = NA) +
  geom_jitter(width = 0.1, mapping = aes(color = Mutation_Status_Nonsilent), size = 1) +
  theme_light() +
  theme(legend.title = element_blank(), axis.text.x = element_text(angle = 90, hjust = 1)) +
  # The x axis shows Protein_Change (genes are in the facet strips), so label it accordingly.
  labs(x = "Protein Change", y = "CERES Score")
crispr_g2p_indications_muts_plot
# ggsave("./plots/manuscript/crispr_g2p_indications_plot_test.png", crispr_g2p_indications_plot, width = 12, height = 5, units = "in")
# ggsave("./plots/manuscript/crispr_g2p_indications_muts_plot_test.png", crispr_g2p_indications_muts_plot, width = 20, height = 5, units = "in")
Barretina, J., Caponigro, G., Stransky, N., Venkatesan, K., Margolin, A. A., Kim, S., … Garraway, L. A. (2012). The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity. Nature, 483(7391), 603–607. https://doi.org/10.1038/nature11003
Broad Institute Cancer Dependency Map; Cancer Data Science (2018): Cancer Dependency Map, CRISPR Avana dataset 18Q3 (Avana_public_18Q3). figshare. Fileset. doi:10.6084/m9.figshare.6931364.v1
Cancer Cell Line Encyclopedia Consortium, & Genomics of Drug Sensitivity in Cancer Consortium. (2015). Pharmacogenomic agreement between two cancer cell line data sets. Nature, 528(7580), 84–87. https://doi.org/10.1038/nature15736
Data Science, Cancer (2018): DEMETER2 data. figshare. Fileset. doi:10.6084/m9.figshare.6025238.v2
Doench, J. G., Fusi, N., Sullender, M., Hegde, M., Vaimberg, E. W., Donovan, K. F., … Root, D. E. (2016). Optimized sgRNA design to maximize activity and minimize off-target effects of CRISPR-Cas9. Nature Biotechnology, 34(2), 184–191. https://doi.org/10.1038/nbt.3437
Meyers, R. M., Bryan, J. G., McFarland, J. M., Weir, B. A., Sizemore, A. E., Xu, H., … Tsherniak, A. (2017). Computational correction of copy-number effect improves specificity of CRISPR-Cas9 essentiality screens in cancer cells. Nature Genetics, 49(12), 1779–1784. https://doi.org/10.1038/ng.3984
McFarland, J. M., Ho, Z. V., Kugener, G., Dempster, J. M., Montgomery, P. G., Bryan, J. G., … Tsherniak, A. (2018). Improved estimation of cancer dependencies from large-scale RNAi screens using model-based normalization and data integration. bioRxiv. https://doi.org/10.1101/305656
# Print the session details for this run; the echoed output follows below as `##` lines.
print(sessionInfo())
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] bindrcpp_0.2.2 VennDiagram_1.6.20 futile.logger_1.4.3
## [4] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [7] purrr_0.2.5 readr_1.1.1 tidyr_0.8.2
## [10] tibble_1.4.2 tidyverse_1.2.1 scales_1.0.0
## [13] reshape2_1.4.3 plotly_4.8.0 papaja_0.1.0.9842
## [16] kableExtra_0.9.0 htmlwidgets_1.2 gridExtra_2.3
## [19] gplots_3.0.1 ggsignif_0.4.0 ggrepel_0.8.0
## [22] ggpubr_0.1.7.999 magrittr_1.5 data.table_1.11.8
## [25] cowplot_0.9.3 ggplot2_3.0.0 bsselectR_0.1.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.0 jsonlite_1.6 viridisLite_0.3.0
## [4] modelr_0.1.2 gtools_3.8.1 assertthat_0.2.0
## [7] highr_0.7 cellranger_1.1.0 yaml_2.2.0
## [10] pillar_1.3.0 backports_1.1.3 lattice_0.20-35
## [13] glue_1.3.0 digest_0.6.18 rvest_0.3.2
## [16] colorspace_1.3-2 htmltools_0.3.6 plyr_1.8.4
## [19] pkgconfig_2.0.2 broom_0.5.1 haven_1.1.2
## [22] gdata_2.18.0 generics_0.0.2 withr_2.1.2
## [25] lazyeval_0.2.1 cli_1.0.1 crayon_1.3.4
## [28] readxl_1.1.0 evaluate_0.12 fansi_0.2.3
## [31] nlme_3.1-137 xml2_1.2.0 tools_3.5.0
## [34] hms_0.4.2 formatR_1.5 munsell_0.5.0
## [37] lambda.r_1.2.3 compiler_3.5.0 caTools_1.17.1.1
## [40] rlang_0.3.0.1 rstudioapi_0.8 labeling_0.3
## [43] bitops_1.0-6 rmarkdown_1.11 gtable_0.2.0
## [46] R6_2.3.0 lubridate_1.7.4 knitr_1.21
## [49] utf8_1.1.4 bindr_0.1.1 futile.options_1.0.1
## [52] KernSmooth_2.23-15 stringi_1.2.4 parallel_3.5.0
## [55] Rcpp_1.0.0 tidyselect_0.2.5 xfun_0.4